mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 04:00:16 +00:00
i2_s to absmax
This commit is contained in:
parent
7a8961fff5
commit
95dced07e4
@@ -3333,12 +3333,11 @@ size_t quantize_i2_s(const float * restrict src, void * restrict dst, int64_t nr
     int n = nrow * n_per_row;
 
     // f32 -> q8
-    double i2_scale = 0;
-    for (int i=0; i<n; i++) {
-        if (fabs((double)(src[i])) > 1e-6) {
-            i2_scale = (double)src[i];
-        }
-    }
+    double max = 0;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, (double)fabs((double)src[i]));
+    }
+    double i2_scale = max;
 
     uint8_t* q8 = (uint8_t*)dst;
     for (int i=0; i<n; i++) {
@@ -3363,11 +3362,9 @@ size_t quantize_i2_s(const float * restrict src, void * restrict dst, int64_t nr
     }
 
     float* scale_ptr = (float*)((char*)i2_weight + n / 4);
-    for (int i=0; i<8; i++) {
-        scale_ptr[i] = i2_scale;
-    }
+    scale_ptr[0] = i2_scale;
 
-    // 32B for scale
+    // 32B for alignment
     return nrow * row_size / 4 + 32;
 }
 
Loading…
Reference in New Issue
Block a user