From 298ff34221493ae5487fb25180ffe119f87a3717 Mon Sep 17 00:00:00 2001
From: Amy <milkdrop2000@protonmail.com>
Date: Tue, 13 Jun 2023 09:11:42 +0100
Subject: [PATCH] clarified dynamic precision picking in QX

---
 ggml.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index dd973d6f9..e7bb3c049 100644
--- a/ggml.c
+++ b/ggml.c
@@ -877,16 +877,19 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 // max block size is 256 because some feed_forward tensors have a width of 11008 weights, which is not divisible by 512
 #define QKX_0 256
 
-// there is no byte-exact C struct to represent a QX_0 block, but a high-level representation of a block is:
+// There is no byte-exact C struct to represent a QX_0 block, but a high-level representation of a block is:
 // ggml_fp16_t delta;
 // ggml_fp16_t min;
 // uint8_t block_metadata;
 // [bitstream of weights]
 
-// quantization parameters for QX_0 (used only when running ./quantize, irrelevant during inference)
+
+// Quantization parameters for QX_0 (used only when running ./quantize, irrelevant during inference)
+// Quantization starts at QX_0_STARTING_QBITS bits, and then moves down to QX_0_START_OF_ATTEMPTED_QBITS
+// and tries lower and lower bit precisions from there
 // TODO maybe move these to commandline arguments...?
 #define QX_0_STARTING_QBITS 4
-#define QX_0_STARTING_QBITS_DOWNSCALING 2
+#define QX_0_START_OF_ATTEMPTED_QBITS 2
 
 
 // reference implementation for deterministic creation of model files
@@ -3204,7 +3207,7 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
     // __AVX2__ doesn't seem to actually make much of a difference,
     // a lot of optimizing could possibly be done, including possibly using AVX2
     // for dequantization...?
-    
+
     #if defined(__AVX2__)
     __m256 rolling_sum = _mm256_setzero_ps();
     #endif
@@ -16604,7 +16607,19 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
         float min_value = -(max_quantization_errors[qbits] * ((1 << qbits) - 1));
         float mult_range = 2 * max_quantization_errors[qbits];
         
-        for (uint8_t test_qbit = QX_0_STARTING_QBITS_DOWNSCALING; test_qbit >= 1; test_qbit--) {
+        // The quantizer starts at a QX_0_STARTING_QBITS quantized block (e.g. 4bits), but then
+        // attempts to move to a lower precision defined by QX_0_START_OF_ATTEMPTED_QBITS.
+        // It keeps looking to see if 3, 2 or 1 bit precision leads to a smaller file size.
+        //
+        // The decrease in precision does not always lead to a smaller file when we need to maintain
+        // a fixed max quantization error, since lower bits mean a smaller value range, which might lead
+        // to more values being moved to 16bits, which might in the end actually increase our block's size.
+        //
+        // If values are very close to the mean, then a lower precision is more advantageous since we don't
+        // need a large quantization range, but otherwise it's likely more beneficial to stay at a higher precision.
+        // The loop below calculates this ideal trade-off for us!
+
+        for (uint8_t test_qbit = QX_0_START_OF_ATTEMPTED_QBITS; test_qbit >= 1; test_qbit--) {
             // calculate the mean of non-fp16 values and define that as the center of the quantization range
             double mean = 0;
             for (int j = 0; j < QKX_0; j++) {