From d94139bf27ab32c0cdef5341323e74350282594a Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Sun, 11 Feb 2024 14:07:19 +0200
Subject: [PATCH] iq1_s: scalar CPU dot product

---
 ggml-quants.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 ggml-quants.h |  1 +
 ggml.c        |  2 +-
 3 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index e4b72a850..bb3f7bb55 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -9282,6 +9282,60 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 #endif
 }
 
+// Scalar dot product of n/QK_K IQ1_S blocks (vx) with the matching Q8_K blocks (vy); the result
+// is written to *s. n must be a multiple of QK_K.
+// bs, bx, by and nrc exist only so that this definition matches the prototype declared in
+// ggml-quants.h; they are not used here. Only one result is written, so callers presumably
+// pass nrc == 1 (TODO confirm against the other vec_dot kernels).
+void ggml_vec_dot_iq1_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    (void)bs; (void)bx; (void)by; (void)nrc;
+
+    const block_iq1_s * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+    int db[4];
+    uint16_t idx[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t  * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        int sumi = 0;
+        for (int i32 = 0; i32 < QK_K/32; ++i32) {
+            // 32 weights per step = 4 groups of 8. Each nibble of sc[] serves one group: bit 3 is
+            // bit 8 of the grid index (qs[] holds bits 0..7), bits 0..2 are the scale v, used as 2*v+1.
+            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
+            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
+            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
+            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
+            db[0] = (2*(sc[0] & 7) + 1);
+            db[1] = (2*((sc[0] >> 4) & 7) + 1);
+            db[2] = (2*(sc[1] & 7) + 1);
+            db[3] = (2*((sc[1] >> 4) & 7) + 1);
+            for (int l = 0; l < 4; ++l) {
+                // Read the selected grid entry as 8 signed bytes (assumes 8-byte iq1s_grid entries).
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                int suml = 0;
+                for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
+                sumi += db[l] * suml;
+                q8 += 8;
+            }
+            qs += 4;
+            sc += 2;
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
+    }
+
+    *s = sumf;
+}
+
 // ================================ IQ2 quantization =============================================
 
 typedef struct {
@@ -10472,6 +10526,12 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
                 memset(L, 1, 8);
                 continue;
             }
+            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+            // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
+            // and score for each possible split.
             for (int j = 0; j < 8; ++j) {
                 pairs[2*j] = xb[j];
                 idx[2*j] = j;
@@ -10504,6 +10564,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
                 for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
                 scale = -scale;
             }
+            // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
+            // grid point that minimizes SSD.
             uint16_t u = 0;
             for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
             int grid_index = kmap_q2xs[u];
@@ -10525,8 +10587,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
         }
 
         float d = max_scale/15;
-        //y[ibl].d = GGML_FP32_TO_FP16(d*1.075f); // 1.075f is another fudge factor. Don't ask me why it is needed.
-        y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.08f is another fudge factor. Don't ask me why it is needed.
+        y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
         float id = 1/d;
         for (int ib = 0; ib < QK_K/8; ++ib) {
             int l = nearest_int(0.5f*(id*scales[ib]-1));
diff --git a/ggml-quants.h b/ggml-quants.h
index 43913d198..ad381cfab 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -267,6 +267,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 //
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
diff --git a/ggml.c b/ggml.c
index 80839ecf3..ee5a60e18 100644
--- a/ggml.c
+++ b/ggml.c
@@ -681,7 +681,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
         .from_float               = NULL,
         .from_float_reference     = NULL,
-        .vec_dot                  = NULL,
+        .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q8_K] = {