mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 20:14:29 +00:00
Use full range for q4_0 quantization
By keeping the sign of the value with the highest magnitude, we can make sure that value maps to -8, a quantization level that is currently unused. This is a bit of a freebie, since it is fully backwards compatible with the current format.

quantize-stats output:
before (7B): q4_0 : mse 0.00000492, maxerr 0.14257812
after (7B): q4_0 : mse 0.00000386, maxerr 0.18200684

(Most layers have a reduced maxerr under this rule, but the total max error is indeed slightly higher.)
This commit is contained in:
parent
0e018fe008
commit
3698f79e6a
12
ggml.c
12
ggml.c
@ -680,13 +680,17 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int l = 0; l < QK4_0; l++) {
|
||||
const float v = x[i*QK4_0 + l];
|
||||
amax = MAX(amax, fabsf(v));
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 3) - 1);
|
||||
const float d = max / -8;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
@ -695,8 +699,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
|
||||
const float v0 = x[i*QK4_0 + l + 0]*id;
|
||||
const float v1 = x[i*QK4_0 + l + 1]*id;
|
||||
|
||||
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
|
||||
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
|
||||
const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8);
|
||||
const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8);
|
||||
|
||||
assert(vi0 < 16);
|
||||
assert(vi1 < 16);
|
||||
|
Loading…
Reference in New Issue
Block a user