diff --git a/ggml.c b/ggml.c
index 7ad696ea8..dd973d6f9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3191,20 +3191,24 @@ __attribute__((optimize("unroll-loops")))
 static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     uint32_t nb = n / QKX_0;
 
     GGML_ASSERT(QKX_0 % QK8_0 == 0);
 
+    *s = 0;
     uint8_t * quant_row = (uint8_t *) vx;
-
-    // row_data stores dequantized values of the current block
-    float f32_row_data[QKX_0];
-
     const block_q8_0 * restrict column = vy;
-
-    uint32_t column_idx = 0;
+    uint32_t column_i = 0; // current index in column
 
-    __m256 rolling_sum = _mm256_setzero_ps();
+    // row_data is a buffer which stores dequantized float values for the current block
+    float f32_row_data[QKX_0];
 
-    // IMPORTANT, Quantized weights should be kept <= 4bits. Change this number for higher values
+    // __AVX2__ doesn't seem to actually make much of a difference here;
+    // a lot of optimization could still be done, including possibly using AVX2
+    // for dequantization...?
+
+    #if defined(__AVX2__)
+    __m256 rolling_sum = _mm256_setzero_ps();
+    #endif
+
     float qvals[1 << 4];
 
     for (int b = 0; b < nb; b++) {
@@ -3218,22 +3222,31 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
         const uint8_t qbits = *((uint8_t *) data_start);
         data_start = (uint16_t*) ((uint8_t*) data_start + 1);
 
-        mult_value /= ((1 << qbits) - 1);
 
         quant_row = (uint8_t * ) data_start;
 
+        // Any qbits value is supported, but the size of qvals needs to be changed to 1 << max_expected_qbits.
+        // So if you have at most 7bit values, you can change qvals's declaration to qvals[1 << 7].
+        // Additionally, the "fp16_indicator == 0" optimized branch only works if qbits is "3" or a power of 2,
+        // so feel free to disable it entirely and run the slower "else" branch, which works for pretty much
+        // any qbit value.
+        GGML_ASSERT(qbits <= 4);
+
         uint32_t offset = 0;
         uint8_t data_offset = 0;
 
+        // Cache quantized values
        for (int i = 0; i < (1 << qbits); i++) {
            qvals[i] = min_value + mult_value * i;
        }
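+        // e.g. with qbits == 2, a max error of 0.004 and a block mean of 0 (illustrative values from the
+        // quantizer below), we'd have min_value == -0.012 and mult_value == 0.008 (the step between two
+        // points), so qvals == {-0.012, -0.004, 0.004, 0.012}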
 
-        // 64 is the size in bits of uint64_t
+        // Parse in sub-blocks of 64 weights, since each sub-block is managed by a single uint64_t whose bits
+        // decide whether a given weight is stored on 16bit or is quantized. This means that we can do a fast
+        // fp16_indicator == 0 check (i.e. all weights are quantized) to speed up performance
         for (int jb = 0; jb < QKX_0 / 64; jb++) {
-            uint64_t fp16_chooser = block_start[jb];
+            uint64_t fp16_indicator = block_start[jb];
 
             // all weights are quantized in this section; ALSO this ONLY works when qbits is <= 4, since (qbits != 3) simply checks if qbits is a power of 2
-            if (fp16_chooser == 0) {
+            if (fp16_indicator == 0) {
                 if (qbits == 3) {
                     // same principle as on the regular data_offset branch, but this time the qbits cross byte boundaries, so we need to manage it by hand
                     for (int i = 0; i < 5; i++) {
@@ -3242,8 +3255,8 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
                             row_ptr[i * 11 + k] = qvals[((((uint64_t *) data_start)[0] >> (data_offset + k * qbits)) & ((1 << qbits) - 1))];
                         }
 
-                        data_start += 2; // this is the same event as in if (data_start >= 16), but happening twice, stealthily
-                        data_offset += 1; // it's actually +33, but we are rounding
+                        data_start += 2; // this is the same event as in if (data_offset >= 16), but happening twice
+                        data_offset += 1; // it's actually +33, but the "+32" is represented in data_start above, so the remainder is simply +1
                     }
 
                     for (int k = 0; k < 9; k ++) {
@@ -3292,24 +3305,17 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
                 offset += qbits * 64;
             } else {
                 for (int i = 0; i < 64; i++) {
-                    // next 32 values are free
-                    if (fp16_chooser & 1) {
+                    if (fp16_indicator & 1) {
+                        // Current weight is fp16
                         offset += 16;
 
                         row_ptr[i] = GGML_FP16_TO_FP32((((uint32_t *) data_start)[0] >> data_offset) & ((1 << 16) - 1));
 
-                        #ifdef ame_debug
-                        printf("%f (16bit)\n", row_ptr[i]);
-                        #endif
-
                         data_start += 1;
                     } else {
+                        // Current weight is quantized
                         offset += qbits;
 
                         row_ptr[i] = qvals[((((uint32_t *) data_start)[0] >> data_offset) & ((1 << qbits) - 1))];
 
-                        #ifdef ame_debug
-                        printf("%ld -> %f (%dbit)\n", ((((uint32_t *) data_start)[0] >> data_offset) & ((1 << qbits) - 1)), row_ptr[i], qbits);
-                        #endif
-
                         data_offset += qbits;
 
                         if (data_offset >= 16) {
@@ -3318,26 +3324,17 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
                         }
                     }
 
-                    fp16_chooser >>= 1;
-
-                    // uint8_t sz = qbits << ((fp16_chooser & 1) << 1);
-
-                    // get_bits(data_start, offset, &w, sz);
-                    // offset += sz;
-
-                    // if (sz == qbits) {
-                    //     row_save[i] = qvals_f16[w];
-                    // } else {
-                    //     row_save[i] = w;
-                    // }
+                    // Shift the fp16 indicator to the right, to move to the next weight
+                    fp16_indicator >>= 1;
                 }
             }
 
             for (int jb = 0; jb < 64 / QK8_0; jb++) {
-                __m256 column_multiplier = _mm256_set1_ps(GGML_FP16_TO_FP32(column[column_idx].d));
+                #if defined(__AVX2__)
+                __m256 column_multiplier = _mm256_set1_ps(GGML_FP16_TO_FP32(column[column_i].d));
 
                 for (int i = 0; i < QK8_0/8; i++) {
-                    __m128i test = _mm_loadu_si128((const __m128i *) (column[column_idx].qs + i * 8));
+                    __m128i test = _mm_loadu_si128((const __m128i *) (column[column_i].qs + i * 8));
 
                     __m256i work = _mm256_cvtepi8_epi32(test);
                     __m256 workf = _mm256_cvtepi32_ps(work);
@@ -3347,30 +3344,38 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
                     workf = _mm256_mul_ps(workf, row);
                     rolling_sum = _mm256_fmadd_ps(workf, column_multiplier, rolling_sum);
                 }
+
+                #else
+                // scalar
+                float sub_sum = 0;
 
-                column_idx += 1;
+                for (int i = 0; i < QK8_0; i++) {
+                    sub_sum += row_ptr[jb * QK8_0 + i] * column[column_i].qs[i];
+                }
+
+                sub_sum *= GGML_FP16_TO_FP32(column[column_i].d);
+                *s += sub_sum;
+
+                #endif
+
+                column_i += 1;
             }
 
-            // horrible manual loop unroll for testing, 1 iteration only
-            // int i = 0;
-            // uint16_t w = 0;
-
-            // if (unlikely(fp16_chooser & 1)) { get_bits(data_start, offset, &w, 16); offset += 16; row_save[i] = w; } else { get_bits(data_start, offset, &w, qbits); offset += qbits; row_save[i] = qvals_f16[w]; } fp16_chooser >>= 1; i++;
-
             row_ptr += 64;
         }
 
-        //printf("offset: %d\n", offset);
 
         GGML_ASSERT(offset % 8 == 0);
         quant_row += offset / 8;
     }
 
+    #if defined(__AVX2__)
     float rolling_sum_vec[8];
     _mm256_store_ps(rolling_sum_vec, rolling_sum);
 
     for (int i = 0; i < 8; i++) {
         *s += rolling_sum_vec[i];
     }
+    #endif
 }
 
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@@ -16524,31 +16529,68 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
     const uint8_t * dst_8 = dst;
 
     uint64_t dst_offset = 0;
 
+    // define max quantization errors for every bit precision
+    // i.e. max_quantization_errors[1] holds the max error for 1bit quantized weights
+    //      max_quantization_errors[2] holds the max error for 2bit quantized weights
+    //      max_quantization_errors[3] holds the max error for 3bit quantized weights
+    //      etc.
+    //
+    // max quantization error here means that every single quantized weight is within
+    // said value (e.g. 0.004) of its original value
+    //
+    // this can be replaced with a max allowed RMSE, a set percentage of weights being within
+    // a certain range, etc... The current implementation here is pretty much just an example
     double max_quantization_errors[5] = {0, 0.004, 0.004, 0, 0.004};
+
+
+    // How maximum quantization error is implemented here:
+    //
+    // Each block holds both fp16 and "qbit" quantized weights mixed together arbitrarily.
+    // This mixing is handled by a few numbers at the start of each block, each bit of those numbers
+    // indicating if the corresponding weight is stored on 16bit or is quantized.
+    //
+    // There is a metadata byte which indicates the qbit precision of the current block, and
+    // its values are in [1,2,3,4], but this can easily be extended to allow any other bit precisions,
+    // such as 5, 6, 9, 13 bits or anything else.
+    //
+    // To guarantee that each weight is within max_quantization_error, we first need to look at what range
+    // of values this allows us to have. Since we have "qbits" bits, we have (1 << qbits) possible values
+    // the quantized weights can take. The maximum distance between two quantized points can be "2 * max_quantization_error",
+    // since any weight situated between those two points will be <= max_quantization_error away from its closest point.
+    //
+    // A visual 2bit example would be: -->|<---->|<---->|<---->|<--
+    // Where "|" are the quantized points, and "-->" represents max_quantization_error on the number line.
+    //
+    // Any value outside this range will have to be kept on 16bit, since it cannot be within max_quantization_error
+    // of its quantized point.
+    //
+    //
+    // Note: Each block is kept byte-aligned for simplicity, which means that the number of 16bit weights and qbit weights
+    //       in the bitstream has to be balanced such that the total number of bits is divisible by 8.
+    //       e.g. If we have 3 4bit values and 253 16bit values, we will need to revert a 4bit value to 16bit in order
+    //       to keep the total number of bits divisible by 8. If we were to quantize a weight instead, we would lose
+    //       the "max_quantization_error" guarantee.
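+    //       (Checking the arithmetic on that example: 3 * 4 + 253 * 16 = 4060 bits, which is not divisible
+    //       by 8, while 2 * 4 + 254 * 16 = 4072 bits = 509 bytes.)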
+    //
+    //       However, each block doesn't actually need to remain byte-aligned; the requirement only holds for
+    //       each row, so a big potential improvement could be made here, since we currently have quite a few
+    //       unnecessary 16bit weights.
 
     for (int i = 0; i < nb; i++) {
-        // each 64bit TODO
-        uint64_t fp16s[QKX_0 / 64];
-
-        memset(fp16s, 0, sizeof(uint64_t) * (QKX_0 / 64));
+        // each 64bit value holds one bit per weight, indicating whether the corresponding weight
+        // is stored on 16bit or is quantized. "QKX_0 / 64" is here since we need multiple 64bit numbers if
+        // the QX_0 block is larger than 64 weights.
+        uint64_t fp16_indicators[QKX_0 / 64];
+        memset(fp16_indicators, 0, sizeof(uint64_t) * (QKX_0 / 64));
 
         uint8_t qbits = QX_0_STARTING_QBITS;
         float thresh = max_quantization_errors[qbits] * (1 << qbits);
 
         int fp16_count = 0;
 
-        // max_quantization_error indicates that no value should be >= max_quantization_error away from
-        // its quantized value;
-        // that means, the total range for the quantized values will be max_quantization_error * 2 * (1 << qbits) (here, 16)
-        // for simplicty, we are going to center on 0, meaning that our fp16 threshold will be max_quantization_error * 16 values to the left and right
-        // -->|<---->|<---->|<---->|<-- 4bit example, --> = max_quant_error; we have "--> * 2 * 3 + --> + -->" positions where quantized values can be, == "--> * 4"
-
         for (int j = 0; j < QKX_0; j++) {
             float x = src[i * QKX_0 + j];
 
             if (fabsf(x) > thresh) {
-                // deactivate quant
-                fp16s[j / 64] |= (uint64_t) 1 << (j % 64);
+                // store this value on 16bits
+                fp16_indicators[j / 64] |= (uint64_t) 1 << (j % 64);
                 fp16_count += 1;
             }
         }
@@ -16556,30 +16598,31 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
         uint16_t total_bits = fp16_count * 16 + (QKX_0 - fp16_count) * qbits;
 
         while ((total_bits % 8) != 0) {
-            total_bits += 16 - qbits; // simulate the replacement of a quantized weight with a 16bit one
+            total_bits += 16 - qbits; // simulate the replacement of a quantized weight with a 16bit one (needed for a block's byte alignment)
         }
 
         float min_value = -(max_quantization_errors[qbits] * ((1 << qbits) - 1));
-        float mult_range = 2 * max_quantization_errors[qbits] * ((1 << qbits) - 1);
+        float mult_range = 2 * max_quantization_errors[qbits];
 
         for (uint8_t test_qbit = QX_0_STARTING_QBITS_DOWNSCALING; test_qbit >= 1; test_qbit--) {
+            // calculate the mean of non-fp16 values and define that as the center of the quantization range
             double mean = 0;
 
             for (int j = 0; j < QKX_0; j++) {
-                if ((fp16s[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
+                if ((fp16_indicators[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
                     float x_fp32 = src[i * QKX_0 + j];
                     mean += x_fp32;
                 }
             }
 
-            mean /= (QKX_0 - fp16_count); // see where weights are centered
+            mean /= (QKX_0 - fp16_count);
 
             uint16_t total_fp16s_in_test_qbit = 0;
 
             thresh = max_quantization_errors[test_qbit] * (1 << test_qbit);
 
             for (int j = 0; j < QKX_0; j++) {
-                if ((fp16s[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
+                if ((fp16_indicators[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
                     float x = src[i * QKX_0 + j];
 
-                    // this weight would need to be put on 16bit
+                    // new outlier found for our current qbit
                     if (x < mean - thresh || x > mean + thresh) {
                         total_fp16s_in_test_qbit += 1;
                     }
@@ -16590,25 +16633,23 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
             uint16_t total_bits_in_test_qbit = total_fp16s_in_test_qbit * 16 + test_qbit * (QKX_0 - total_fp16s_in_test_qbit);
 
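+            // (converting one quantized weight to fp16 removes test_qbit bits and adds 16, hence the
+            // net "16 - test_qbit" added per simulated conversion below)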
             while ((total_bits_in_test_qbit % 8) != 0) {
-                total_bits_in_test_qbit += 16 - test_qbit; // simulate the replacement of a 3bit weight with a 16bit one
+                total_bits_in_test_qbit += 16 - test_qbit; // simulate the replacement of a qbit weight with a 16bit one
             }
 
             if (total_bits_in_test_qbit < total_bits) {
-                //printf("switching to %dbit! %d vs %d\n", test_qbit, total_bits, total_bits_in_test_qbit);
-
                 total_bits = total_bits_in_test_qbit;
                 qbits = test_qbit;
 
                 min_value = mean - (max_quantization_errors[test_qbit] * ((1 << qbits) - 1));
-                mult_range = 2 * max_quantization_errors[test_qbit] * ((1 << qbits) - 1);
+                mult_range = 2 * max_quantization_errors[test_qbit];
 
                 for (int j = 0; j < QKX_0; j++) {
-                    if ((fp16s[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
+                    if ((fp16_indicators[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
                         float x = src[i * QKX_0 + j];
 
-                        // this weight would need to be put on 16bit
+                        // mark outlier as stored on 16bit
                         if (x < mean - thresh || x > mean + thresh) {
-                            fp16s[j / 64] |= (uint64_t) 1 << (j % 64);
+                            fp16_indicators[j / 64] |= (uint64_t) 1 << (j % 64);
                             fp16_count += 1;
                         }
                     }
@@ -16616,13 +16657,14 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
                 uint16_t total_test_bits = fp16_count * 16 + (QKX_0 - fp16_count) * qbits;
 
                 while ((total_test_bits % 8) != 0) {
-                    total_test_bits += 16 - test_qbit; // simulate the replacement of a 3bit weight with a 16bit one
+                    total_test_bits += 16 - test_qbit; // simulate the replacement of a qbit weight with a 16bit one
                 }
 
                 GGML_ASSERT(total_bits == total_test_bits);
             }
         }
 
+        // keep converting the largest qbit values to fp16 until the block is byte-aligned
         while (((QKX_0 - fp16_count) * qbits) % 8 != 0) {
             float maxi = 0;
             int target = -1;
@@ -16631,7 +16673,7 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
                 float x = src[i * QKX_0 + j];
 
                 // weight is not on 16bit
-                if ((fp16s[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
+                if ((fp16_indicators[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
                     float diff = fabsf(x);
 
                     if (diff > maxi || target == -1) {
                         maxi = diff;
@@ -16641,38 +16683,46 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
             }
 
             GGML_ASSERT(target != -1);
-            fp16s[target / 64] |= (uint64_t) 1 << (target % 64);
+            fp16_indicators[target / 64] |= (uint64_t) 1 << (target % 64);
             fp16_count += 1;
         }
 
+        // store the byte-offset at which the previous row ends (and the new one starts), if "i"
+        // indicates that this is the first block of a row
         if (((i * QKX_0) % tensor_width == 0) && i != 0) {
             uint32_t row = (i * QKX_0) / tensor_width;
             extra_data[row - 1] = dst_offset;
         }
 
-        uint64_t * fp16_data = (uint64_t *) (dst_8 + dst_offset);
+        // write the fp16 indicators to dst
+        uint64_t * stored_fp16_indicators = (uint64_t *) (dst_8 + dst_offset);
 
-        // write the data
         for (int j = 0; j < QKX_0 / 64; j++) {
-            fp16_data[j] = fp16s[j];
+            stored_fp16_indicators[j] = fp16_indicators[j];
         }
 
         dst_offset += (QKX_0 / 64) * sizeof(uint64_t);
 
-        // write min value and multiplier (min_value + mult * quant_number, result should be divided by (1 << QBits) during multplication)
+        // Each weight is stored as min_value + mult * quantized_weight
+        // Similar to zero-point quantization, or Q4_1
+
+        // Write min value and multiplier to dst
         *((uint16_t*) (dst_8 + dst_offset)) = ggml_fp32_to_fp16(min_value);
         dst_offset += sizeof(uint16_t);
 
         *((uint16_t*) (dst_8 + dst_offset)) = ggml_fp32_to_fp16(mult_range);
         dst_offset += sizeof(uint16_t);
 
+        // Store the "metadata" byte (for now it's just "qbits")
         *((uint8_t*) (dst_8 + dst_offset)) = qbits;
         dst_offset += sizeof(uint8_t);
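+
+        // Block layout so far: [QKX_0 / 64 x uint64_t fp16 indicators][fp16 min_value][fp16 mult_range][uint8_t qbits],
+        // with the packed bitstream of fp16 and qbit weights written right after it below.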
+
+        // Compute the quantization points (pivots)
         float qvals[1 << qbits];
 
         for (int i = 0; i < (1 << qbits); i++) {
-            qvals[i] = min_value + (mult_range * i) / ((1 << qbits) - 1);
+            qvals[i] = min_value + (mult_range * i);
         }
 
         uint64_t bit_offset = 0;
@@ -16683,9 +16733,10 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
         for (int j = 0; j < QKX_0; j++) {
             float x = src[i * QKX_0 + j];
 
-            if (fp16s[j / 64] & ((uint64_t) 1 << (j % 64))) {
+            if (fp16_indicators[j / 64] & ((uint64_t) 1 << (j % 64))) {
                 ggml_fp16_t x_f16 = ggml_fp32_to_fp16(x);
 
+                // store the full fp16 weight
                 write_bits(data, bit_offset, x_f16, 16);
                 bit_offset += 16;
                 fp16_count_chk += 1;
@@ -16693,6 +16744,7 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
                 uint8_t q = 0;
                 float min_dist = fabsf(x - qvals[0]);
 
+                // find closest quantization point
                 for (int iv = 0; iv < (1 << qbits); iv++) {
                     float dist = fabsf(x - qvals[iv]);
 
                     if (dist < min_dist) {
@@ -16706,13 +16758,17 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
             }
         }
 
+        // check that the reported fp16_count is consistent with the bits stored in fp16_indicators
         GGML_ASSERT(fp16_count == fp16_count_chk);
+
+        // check that the number of bits from quantized values is divisible by 8
         GGML_ASSERT((((QKX_0 - fp16_count) * qbits) % 8) == 0);
 
         dst_offset += ((QKX_0 - fp16_count) * qbits) / 8;
         dst_offset += fp16_count * 2;
     }
 
+    // store the total size of the tensor as the last element of extra_data
     extra_data[n / tensor_width - 1] = dst_offset;
 
     return dst_offset;
diff --git a/llama.cpp b/llama.cpp
index e0ff34861..158d9ebe8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -883,12 +883,6 @@ struct llama_model_loader {
             if (lt.shards.at(0).extra_data_file_off != 0) {
                 lt.extra_data = (uint64_t *) ((uint8_t *) mapping->addr + lt.shards.at(0).extra_data_file_off);
             }
-            printf("load data for %s\n", lt.name.c_str());
-
-            if (lt.extra_data != NULL) {
-                printf("extra_data_file_off: %zu, data: %p, extra_data: %p\n", lt.shards.at(0).extra_data_file_off, lt.data, lt.extra_data);
-                printf("extra_data for %s: %lu %lu ... %lu\n", lt.name.c_str(), lt.extra_data[0], lt.extra_data[1], lt.extra_data[lt.ne[1] - 1]);
-            }
         } else if (lt.split_type == SPLIT_NONE) {
             llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
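
For reference, the byte-alignment rule described in the ggml_quantize_qx_0 comments can be checked in isolation. Below is a minimal standalone sketch (not part of the patch) of the promote-to-fp16-until-byte-aligned step; QKX_0 = 256 and the helper name promotions_needed are assumptions made for illustration only.

#include <stdio.h>

#define QKX_0 256 // assumed block size for this sketch; the real value is defined in ggml.c

// How many quantized weights must be promoted to fp16 so that the quantized
// portion of a block's bitstream ends on a byte boundary.
static int promotions_needed(int fp16_count, int qbits) {
    int promoted = 0;
    while (((QKX_0 - fp16_count - promoted) * qbits) % 8 != 0) {
        promoted += 1;
    }
    return promoted;
}

int main(void) {
    // A block that already has 5 outliers stored on 16bit, at each supported precision:
    for (int qbits = 1; qbits <= 4; qbits++) {
        int fp16_count = 5;
        int extra      = promotions_needed(fp16_count, qbits);
        int total_bits = (fp16_count + extra) * 16 + (QKX_0 - fp16_count - extra) * qbits;

        // The fp16 part is always a multiple of 8 bits, so once the quantized
        // part is byte-aligned the whole block payload is byte-aligned.
        printf("qbits=%d: promote %d weight(s), payload = %d bits (%d bytes)\n",
               qbits, extra, total_bits, total_bits / 8);
    }
    return 0;
}

With these numbers the sketch promotes 3, 3, 3 and 1 weight(s) for qbits = 1 to 4 respectively, matching what the "total_bits += 16 - qbits" simulation in ggml_quantize_qx_0 converges to.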