From 1e06f12714eeaf5eb06c807b3b3384aca563e0bf Mon Sep 17 00:00:00 2001
From: Amy
Date: Tue, 13 Jun 2023 10:39:04 +0100
Subject: [PATCH] Removed trailing whitespaces, removed variable-length arrays,
 removed debug print

---
 ggml.c    | 56 ++++++++++++++++++++++++++++---------------------------
 llama.cpp | 28 +++++++++++++++-------------
 2 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/ggml.c b/ggml.c
index 879f8780c..2fff71523 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3203,7 +3203,7 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
     // row_data is a buffer which stores dequantized float values for a current block
     float f32_row_data[QKX_0];
-    
+
     // __AVX2__ doesn't seem to actually make much of a difference,
     // a lot of optimizing could possibly be done, including possibly using AVX2
     // for dequantization...?
@@ -3280,8 +3280,8 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
             const uint8_t data_block_size = 64; // we can take a full 64bit block
             const uint8_t weights_per_u64_data_block = data_block_size / qbits;
-            const uint8_t num_of_data_blocks_needed = 64 / weights_per_u64_data_block; // because we have 64 qbit-sized weights here 
-            
+            const uint8_t num_of_data_blocks_needed = 64 / weights_per_u64_data_block; // because we have 64 qbit-sized weights here
+
             for (int i = 0; i < num_of_data_blocks_needed; i++) {
                 for (int k = 0; k < weights_per_u64_data_block; k ++) {
                     row_ptr[i * weights_per_u64_data_block + k] = qvals[(((const uint64_t *) data_start)[0] >> (k * qbits)) & ((1 << qbits) - 1)];
                 }
@@ -3340,14 +3340,14 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
                 __m128i test = _mm_loadu_si128((const __m128i *) (column[column_i].qs + i * 8));
                 __m256i work = _mm256_cvtepi8_epi32(test);
                 __m256 workf = _mm256_cvtepi32_ps(work);
-                
+
                 // multiply with our 8 parts of the row at row_data
                 __m256 row = _mm256_loadu_ps(row_ptr + jb * QK8_0 + i * 8);
                 workf = _mm256_mul_ps(workf, row);
                 rolling_sum = _mm256_fmadd_ps(workf, column_multiplier, rolling_sum);
             }
-            
+
 #else
             // scalar
             float sub_sum = 0;
@@ -3370,11 +3370,11 @@ static void ggml_vec_dot_qx_0_q8_0(const int n, float * restrict s, const void *
             GGML_ASSERT(offset % 8 == 0);
             quant_row += offset / 8;
         }
-    
+
 #if defined(__AVX2__)
     float rolling_sum_vec[8];
     _mm256_store_ps(rolling_sum_vec, rolling_sum);
-    
+
     for (int i = 0; i < 8; i++) {
         *s += rolling_sum_vec[i];
     }
@@ -4530,7 +4530,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     }
     result->nb[0] = GGML_TYPE_SIZE[type];
-    
+
     if (type == GGML_TYPE_QX_0) {
         // QX_0 doesn't have a set stride size for a row; that value is stored in the "extra" part of the tensor
         result->nb[1] = 0;
@@ -10464,7 +10464,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
         const int i3 = i03;
         void * src0_row;
-        
+
         if (type == GGML_TYPE_QX_0) {
             if (ir > 0) {
                 src0_row = (void *) ((char *) src0->data + ((uint64_t *) src0->extra)[ir - 1]);
@@ -10478,7 +10478,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
         char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size));
         float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
-        
+
         for (int64_t ic = 0; ic < ne11; ++ic) {
             vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
         }
@@ -16528,7 +16528,7 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
     assert(n % QKX_0 == 0);
     assert(tensor_width % QKX_0 == 0);
     const int nb = n / QKX_0;
-    
+
     uint8_t * dst_8 = dst;
     uint64_t dst_offset = 0;
@@ -16544,7 +16544,7 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
     // this can be replaced with a max allowed RMSE, a set percentage of weights being within
     // a certain range, etc... The current implementation here is pretty much just an example
     float max_quantization_errors[5] = {0, 0.004, 0.004, 0, 0.004};
-    
+
     // How maximum quantization error is implemented here:
     //
@@ -16599,14 +16599,14 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
         }
         uint16_t total_bits = fp16_count * 16 + (QKX_0 - fp16_count) * qbits;
-        
+
         while ((total_bits % 8) != 0) {
             total_bits += 16 - qbits; // simulate the replacement of a quantized weight with a 16bit one (needed for a block's byte alignment)
         }
         float min_value = -(max_quantization_errors[qbits] * ((1 << qbits) - 1));
         float mult_range = 2 * max_quantization_errors[qbits];
-        
+
         // The quantizer starts at a QX_0_STARTING_QBITS quantized block (e.g. 4bits), but then
         // attempts to move to a lower precision defined by QX_0_START_OF_ATTEMPTED_QBITS.
         // It keeps looking to see if 3, 2 or 1 bit precision leads to a smaller file size.
@@ -16629,7 +16629,7 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
                 }
             }
             mean /= (QKX_0 - fp16_count);
-            
+
             uint16_t total_fp16s_in_test_qbit = 0;
             thresh = max_quantization_errors[test_qbit] * (1 << test_qbit);
@@ -16645,12 +16645,12 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
                     total_fp16s_in_test_qbit += 1;
                 }
             }
-            
+
             uint16_t total_bits_in_test_qbit = total_fp16s_in_test_qbit * 16 + test_qbit * (QKX_0 - total_fp16s_in_test_qbit);
             while ((total_bits_in_test_qbit % 8) != 0) {
                 total_bits_in_test_qbit += 16 - test_qbit; // simulate the replacement of a qbit weight with a 16bit one
             }
-            
+
             if (total_bits_in_test_qbit < total_bits) {
                 total_bits = total_bits_in_test_qbit;
                 qbits = test_qbit;
@@ -16686,7 +16686,7 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
         for (int j = 0; j < QKX_0; j++) {
             float x = src[i * QKX_0 + j];
-            
+
             // weight is not on 16bit
             if ((fp16_indicators[j / 64] & ((uint64_t) 1 << (j % 64))) == 0) {
                 float diff = fabsf(x);
@@ -16717,25 +16717,27 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
         }
         dst_offset += (QKX_0 / 64) * sizeof(uint64_t);
-        
+
         // Each weight is stored as min_value + mult * quantized_weight
         // Similar to Zero-point quantization, or Q4_1
         // Write min value and multiplier to dst
         *((uint16_t*) (dst_8 + dst_offset)) = ggml_fp32_to_fp16(min_value);
         dst_offset += sizeof(uint16_t);
-        
+
         *((uint16_t*) (dst_8 + dst_offset)) = ggml_fp32_to_fp16(mult_range);
         dst_offset += sizeof(uint16_t);
-        
+
         // Store the "metadata" byte (for now it's just "qbits")
         *((uint8_t*) (dst_8 + dst_offset)) = qbits;
         dst_offset += sizeof(uint8_t);
         // Store the quantization pivots / points
-        float qvals[1 << qbits];
-        
+        // IMPORTANT: Change qvals's size depending on the maximum qbits expected
+        GGML_ASSERT(qbits <= 8);
+        float qvals[1 << 8];
+
         for (int j = 0; j < (1 << qbits); j++) {
             qvals[j] = min_value + (mult_range * j);
         }
@@ -16744,7 +16746,7 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
         uint32_t * data = (uint32_t*) (dst_8 + dst_offset);
         int fp16_count_chk = 0;
-        
+
         for (int j = 0; j < QKX_0; j++) {
             float x = src[i * QKX_0 + j];
@@ -16772,17 +16774,17 @@ size_t ggml_quantize_qx_0(const float * src, void * dst, int n, int64_t * hist,
                 bit_offset += qbits;
             }
         }
-        
+
         // check that the reported fp16_count is coherent with the bits stored in fp16_indicators
         GGML_ASSERT(fp16_count == fp16_count_chk);
         // check that the number of bits from quantized values is divisible by 8
         GGML_ASSERT((((QKX_0 - fp16_count) * qbits) % 8) == 0);
-        
+
         dst_offset += ((QKX_0 - fp16_count) * qbits) / 8;
         dst_offset += fp16_count * 2;
     }
-    
+
     // store the total size of the tensor as the last element of extra_data
     extra_data[n / tensor_width - 1] = dst_offset;
diff --git a/llama.cpp b/llama.cpp
index 158d9ebe8..105e3e169 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -435,7 +435,7 @@ struct llama_load_tensor {
             GGML_ASSERT(ne.size() == 2);
             size = shards.at(0).size;
-            
+
             GGML_ASSERT(size != 0);
         } else {
             size = llama_calc_tensor_size(ne, type);
@@ -566,12 +566,14 @@ struct llama_file_loader {
             if (shard.type == GGML_TYPE_QX_0) {
                 shard.extra_data_file_off = file.tell();
-
-                uint64_t extra_data[shard.ne[1]];
-                file.read_raw(extra_data, sizeof(uint64_t) * shard.ne[1]);
-                // set the size of the tensor here
-                shard.size = extra_data[shard.ne[1] - 1];
+                // seek until before the last element of extra_data
+                file.seek(sizeof(uint64_t) * (shard.ne[1] - 1), SEEK_CUR);
+
+                // get the tensor's size from here
+                uint64_t tensor_size = 0;
+                file.read_raw(&tensor_size, sizeof(uint64_t));
+                shard.size = tensor_size;
                 // realign, just in case extra_data isn't a multiple of 32B
                 file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
@@ -686,7 +688,7 @@ struct llama_file_saver {
         LLAMA_ASSERT(new_size == tensor_size);
         file.write_raw(new_data, new_size);
-        
+
         // QX_0 data may not be 32-byte aligned
         if (new_type == GGML_TYPE_QX_0) {
             file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
@@ -836,7 +838,7 @@ struct llama_model_loader {
         switch(lt.ggml_tensor->backend) {
             case GGML_BACKEND_CPU:
                 lt.ggml_tensor->data = lt.data;
-                
+
                 if (lt.type == GGML_TYPE_QX_0) {
                     // QX_0 uses the extra field to store byte offsets in *data for each row except row 0
                     // (so extra[0] stores where row 1 starts, extra[1] is for row 2, and the last element
@@ -883,7 +885,7 @@ struct llama_model_loader {
             if (lt.shards.at(0).extra_data_file_off != 0) {
                 lt.extra_data = (uint64_t *) ((uint8_t *) mapping->addr + lt.shards.at(0).extra_data_file_off);
             }
-            
+
         } else if (lt.split_type == SPLIT_NONE) {
             llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
             file.seek(lt.shards.at(0).file_off, SEEK_SET);
@@ -1746,8 +1748,8 @@ static bool llama_eval_internal(
         lctx.n_p_eval += N;
     }
-    fprintf(stderr, "\nmodel eval time: %ldms\n", (ggml_time_us() - t_start_us) / 1000);
-    fflush(stderr);
+    // fprintf(stderr, "\nmodel eval time: %ldms\n", (ggml_time_us() - t_start_us) / 1000);
+    // fflush(stderr);
     return true;
 }
@@ -2399,7 +2401,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
     }
-    
+
     // multithreaded QX_0 quantization is not compatible with the current multithreaded quantization impl.
     // because, since blocks have an unknown size in bytes, we cannot section the output data in exact
     // chunks assigned to 1 thread. Multithreading would technically only be possible if we quantize
@@ -2558,7 +2560,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     if (local_hist.empty()) {
                         local_hist.resize(hist_cur.size(), 0);
                     }
-                    
+
                     // pass in NULL for extra_data, since it's only required for QX_0, which doesn't support quantized threading
                     local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data(), NULL, 0);
                 }
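
Editor's note on the QX_0 block layout (an illustration, not part of the patch): per QKX_0 block, ggml_quantize_qx_0 writes the fp16-outlier indicator bitmap (QKX_0 / 64 uint64 words), an fp16 min_value, an fp16 mult_range, a one-byte "metadata" field that currently holds only qbits, and then the packed payload. The sketch below parses that header. The struct and function names (qx0_block_header, parse_qx0_block_header), the QKX_0 value of 256, and the caller-supplied fp16 converter are assumptions made to keep the example self-contained; in ggml the conversion would be ggml_fp16_to_fp32.

#include <stdint.h>
#include <string.h>

#define QKX_0 256  /* assumed block size for this sketch; the real value lives in ggml.c */

typedef struct {
    uint64_t fp16_indicators[QKX_0 / 64]; /* bit j set => weight j is stored as an fp16 outlier */
    float    min_value;                   /* dequantized weight = min_value + mult_range * q    */
    float    mult_range;
    uint8_t  qbits;                       /* bits per quantized weight in this block            */
    const uint8_t * payload;              /* packed quantized weights and fp16 outliers follow  */
} qx0_block_header;

/* fp16_to_fp32 is passed in so the sketch stays self-contained;
 * in ggml this role is played by ggml_fp16_to_fp32. */
static const uint8_t * parse_qx0_block_header(const uint8_t * p,
                                              float (*fp16_to_fp32)(uint16_t),
                                              qx0_block_header * out) {
    /* 1. outlier bitmap: QKX_0 / 64 uint64 words */
    memcpy(out->fp16_indicators, p, sizeof(out->fp16_indicators));
    p += sizeof(out->fp16_indicators);

    /* 2. fp16 min value */
    uint16_t h;
    memcpy(&h, p, sizeof(h));
    p += sizeof(h);
    out->min_value = fp16_to_fp32(h);

    /* 3. fp16 multiplier (step size) */
    memcpy(&h, p, sizeof(h));
    p += sizeof(h);
    out->mult_range = fp16_to_fp32(h);

    /* 4. metadata byte: currently just qbits */
    out->qbits = *p++;

    /* 5. packed data starts here */
    out->payload = p;
    return p;
}

Because the payload length depends on both qbits and the number of fp16 outliers, a block's byte size is only known after its header has been read. That is why the patch keeps per-row byte offsets in the tensor's extra field (with the total tensor size as the last element of extra_data) and why multithreaded quantization stays disabled for QX_0.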