From e435bfd93cbc03970450486c4ea526a0fa5aa7f6 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Fri, 21 Apr 2023 10:26:49 +0200
Subject: [PATCH] RMSE-optimized quants for all quantization types

By default this new option is ON. One can turn it off by setting
LLAMA_NO_RMSE. With this option enabled, the Q4_3 quantization results in
a perplexity of 6.0344, i.e., 0.0273 lower than simple Q4_3 quantization.
---
 CMakeLists.txt |   7 +
 Makefile       |   4 +
 ggml.c         | 355 ++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 286 insertions(+), 80 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 11ebe9eb6..1f31cfa20 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,6 +68,9 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework"
 option(LLAMA_OPENBLAS        "llama: use OpenBLAS"                  OFF)
 option(LLAMA_CUBLAS          "llama: use cuBLAS"                    OFF)
 
+# RMSE minimization when quantizing
+option(LLAMA_NO_RMSE         "llama: disable RMSE minimization"     OFF)
+
 option(LLAMA_BUILD_TESTS     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES  "llama: build examples" ${LLAMA_STANDALONE})
 
@@ -99,6 +102,10 @@ if (NOT MSVC)
     endif()
 endif()
 
+if (LLAMA_NO_RMSE)
+    add_compile_definitions(GGML_NO_RMSE)
+endif()
+
 if (APPLE AND LLAMA_ACCELERATE)
     find_library(ACCELERATE_FRAMEWORK Accelerate)
     if (ACCELERATE_FRAMEWORK)
diff --git a/Makefile b/Makefile
index b297959c9..04ddc4f10 100644
--- a/Makefile
+++ b/Makefile
@@ -134,6 +134,10 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif
 
+ifdef LLAMA_NO_RMSE
+	CFLAGS += -DGGML_NO_RMSE
+endif
+
 #
 # Print build information
 #
diff --git a/ggml.c b/ggml.c
index 281b20283..78982eec8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -670,10 +670,107 @@ typedef struct {
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == 3*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 
+#ifndef GGML_NO_RMSE
+// Stuff for RMSE-minimizing quantization
+static inline int nearest_int(float fval) {
+    assert(fval <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates,
+        const float * restrict candidates, int8_t * restrict L) {
+    assert (nmin >= INT8_MIN);
+    assert (nmax <= INT8_MAX);
+    float amax = 0;
+    for (int i=0; i<n; ++i) amax = MAX(amax, fabsf(X[i]));
+    if (!amax) { // all zero
+        for (int i=0; i<n; ++i) L[i] = 0;
+        return 1.f;
+    }
+    float best = 0, bestScale = 0;
+    for (int si=0; si<nCandidates; ++si) {
+        float iscale = candidates[si]/amax;
+        float sumlxP = 0; int suml2P = 0;
+        float sumlxM = 0; int suml2M = 0;
+        for (int i=0; i<n; ++i) {
+            int l = nearest_int(iscale*X[i]);
+            int lp = MAX(nmin, MIN(nmax, +l));
+            int lm = MAX(nmin, MIN(nmax, -l));
+            sumlxP += X[i]*lp; suml2P += lp*lp;
+            sumlxM += X[i]*lm; suml2M += lm*lm;
+        }
+        float sumlxP2 = sumlxP*sumlxP;
+        float sumlxM2 = sumlxM*sumlxM;
+        if (sumlxP2*suml2M > sumlxM2*suml2P) {
+            if (sumlxP2 > best*suml2P) {
+                best = sumlxP2/suml2P; bestScale = iscale;
+            }
+        } else {
+            if (sumlxM2 > best*suml2M) {
+                best = sumlxM2/suml2M; bestScale = -iscale;
+            }
+        }
+    }
+    float sumlx = 0; int suml2 = 0;
+    for (int i=0; i<n; ++i) {
+        int l = nearest_int(bestScale*X[i]);
+        l = MAX(nmin, MIN(nmax, l));
+        sumlx += X[i]*l; suml2 += l*l;
+        L[i] = l;
+    }
+    return sumlx/suml2;
+}
+
+static void quantize_row_q41_helper(int n, const float * restrict x, int8_t * restrict L, float * restrict tmp_x,
+        float * restrict result_a, float * restrict result_b) {
+    float min = x[0], max = x[0];
+    for (int j=1; j<n; ++j) {
+        if (x[j] < min) min = x[j];
+        if (x[j] > max) max = x[j];
+    }
+    if (max == min) {
+        *result_a = min;
+        *result_b = 1.f;
+        for (int j=0; j 0 && fabsf(a - aold) < epsilon*fabsf(aold) && fabsf(b - bold) < epsilon*fabsf(bold)) break;
+    }
+    float err = 0;
+    for (int j=0; j simple_err) {
+        a = min; b = (max - min)/15;
+        for (int j=0; j
-    assert (nmin >= INT8_MIN);
-    assert (nmax <= INT8_MAX);
-    float amax = 0;
-    for (int i=0; i sumlxM2*suml2P) {
-            if (sumlxP2 > best*suml2P) {
-                best = sumlxP2/suml2P; bestScale = iscale;
-            }
-        } else {
-            if (sumlxM2 > best*suml2M) {
-                best = sumlxM2/suml2M; bestScale = -iscale;
-            }
+
+        x += QK4_1;
+    }
+}
+static void quantize_row_q4_3_rmse(const float * restrict x, block_q4_3 * restrict y, int k) {
+    assert(k % QK4_3 == 0);
+
+    int8_t L[QK4_3];
+    float tmp_x[QK4_3];
+
+    const int nb = k / QK4_3;
+
+    for (int i = 0; i < nb; i++) {
+        float a, b;
+        quantize_row_q41_helper(QK4_3, x, L, tmp_x, &a, &b);
+        y[i].d = GGML_FP32_TO_FP16(b);
+        y[i].m = GGML_FP32_TO_FP16(a);
+
+        for (int l = 0; l < QK4_3; l += 2) {
+            const uint8_t vi0 = (uint8_t)(L[l+0]);
+            const uint8_t vi1 = (uint8_t)(L[l+1]);
+
+            assert(vi0 < 16);
+            assert(vi1 < 16);
+
+            y[i].qs[l/2] = vi0 | (vi1 << 4);
+        }
+
+        x += QK4_3;
+    }
+}
-    float sumlx = 0; int suml2 = 0;
-    for (int i=0; i
+static void collect_quant_histogram(int n, const uint8_t * qs, int64_t * hist) {
+    for (int l = 0; l < n; l += 2) {
+        const uint8_t vi0 = qs[l/2] & 0xF;
+        const uint8_t vi1 = qs[l/2] >> 4;
+
+        hist[vi0]++;
+        hist[vi1]++;
+    }
+}
+
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
@@ -12084,13 +12304,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t
         quantize_row_q4_0_reference(src + j, y, k);
 
         for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_0; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0xF;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
+            collect_quant_histogram(QK4_0, y[i].qs, hist);
         }
     }
 
@@ -12107,13 +12321,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t
         quantize_row_q4_1_reference(src + j, y, k);
 
         for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_1; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0xF;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
+            collect_quant_histogram(QK4_1, y[i].qs, hist);
         }
     }
 
@@ -12127,17 +12335,10 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t
     for (int j = 0; j < n; j += k) {
         block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;
 
-        //quantize_row_q4_2_reference(src + j, y, k);
-        quantize_row_q4_2_rmse(src + j, y, k);
+        quantize_row_q4_2_reference(src + j, y, k);
 
         for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_2; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0xF;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
+            collect_quant_histogram(QK4_2, y[i].qs, hist);
         }
     }
 
@@ -12154,13 +12355,7 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t
         quantize_row_q4_3_reference(src + j, y, k);
 
         for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_3; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0xF;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
+            collect_quant_histogram(QK4_3, y[i].qs, hist);
         }
     }
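
A note on the math behind the scale search in kquantize_q4_with_bounds: for a fixed choice
of integer levels l_i, the scale d that minimizes sum_i (x_i - d*l_i)^2 is d = sum(x_i*l_i) / sum(l_i^2),
and the remaining squared error is sum(x_i^2) - sum(x_i*l_i)^2 / sum(l_i^2). Picking the candidate scale
with the lowest RMSE is therefore the same as maximizing sum(x_i*l_i)^2 / sum(l_i^2), which is the
quantity tracked in `best` above. The stand-alone sketch below illustrates that search for a single
block; the block size, the toy data, the candidate list, and the demo_* names are assumptions for
illustration (not taken from the patch), and the sign-flipped candidate (-iscale) that the patch also
tries is omitted for brevity.

/* Minimal sketch of an RMSE-optimal scale search for one block of signed
 * 4-bit quants. Block size, data, and candidate list are assumed values. */
#include <math.h>
#include <stdio.h>
#include <string.h>

#define N 16 /* assumed block size for the demo */

/* Same rounding trick as nearest_int() in the patch: adding 1.5*2^23 leaves
 * the rounded integer in the low mantissa bits of the float. */
static int demo_nearest_int(float fval) {
    float val = fval + 12582912.f;
    int i; memcpy(&i, &val, sizeof(int));
    return (i & 0x007fffff) - 0x00400000;
}

int main(void) {
    float x[N];
    for (int i = 0; i < N; ++i) x[i] = sinf(0.7f*(i+1)); /* toy data */

    float amax = 0;
    for (int i = 0; i < N; ++i) amax = fmaxf(amax, fabsf(x[i]));

    /* Candidate inverse scales, in units of 1/amax (assumed values). */
    const float candidates[] = {5.7f, 6.3f, 7.0f, 7.3f, 7.8f, 8.3f, 8.7f};
    const int ncand = (int)(sizeof(candidates)/sizeof(candidates[0]));

    float best = 0, best_iscale = 0;
    for (int s = 0; s < ncand; ++s) {
        const float iscale = candidates[s]/amax;
        float sumlx = 0; int suml2 = 0;
        for (int i = 0; i < N; ++i) {
            int l = demo_nearest_int(iscale*x[i]);
            l = l < -8 ? -8 : l > 7 ? 7 : l; /* clamp to the signed 4-bit range */
            sumlx += x[i]*l; suml2 += l*l;
        }
        /* With the levels fixed, the best scale is sumlx/suml2 and the squared
         * error is sum(x^2) - sumlx^2/suml2, so the candidate maximizing
         * sumlx^2/suml2 minimizes the RMSE. */
        if (suml2 > 0 && sumlx*sumlx > best*suml2) {
            best = sumlx*sumlx/suml2;
            best_iscale = iscale;
        }
    }

    /* Re-quantize with the winning candidate and report the achieved RMSE. */
    int L[N];
    float sumlx = 0; int suml2 = 0;
    for (int i = 0; i < N; ++i) {
        int l = demo_nearest_int(best_iscale*x[i]);
        l = l < -8 ? -8 : l > 7 ? 7 : l;
        L[i] = l;
        sumlx += x[i]*l; suml2 += l*l;
    }
    const float d = suml2 ? sumlx/suml2 : 0.f;
    float err = 0;
    for (int i = 0; i < N; ++i) {
        const float diff = x[i] - d*L[i];
        err += diff*diff;
    }
    printf("scale = %.6f, rmse = %.6f\n", d, sqrtf(err/N));
    return 0;
}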