diff --git a/ggml-quants.c b/ggml-quants.c index 4f3731067..665e381a3 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3333,12 +3333,11 @@ size_t quantize_i2_s(const float * restrict src, void * restrict dst, int64_t nr int n = nrow * n_per_row; // f32 -> q8 - double i2_scale = 0; - for (int i=0; i 1e-6) { - i2_scale = (double)src[i]; - } + double max = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, (double)fabs((double)src[i])); } + double i2_scale = max; uint8_t* q8 = (uint8_t*)dst; for (int i=0; i