mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (#5760)
* WIP: make i-quants work for QK_K = 64
* iq2_xs: attempt to fix AVX dot product for QK_K = 64
  Tests pass, but I get gibberish.
* QK_K = 64 tests pass on ARM_NEON and Metal
  Sadly, that does not mean it actually works.
* Make CUDA compile with QK_K = 64
  Tests don't pass, plus we get misaligned access
* Q2_K: fixed bug in imatrix quantization for QK_K = 64
* iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work)
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
parent
cb49e0f8c9
commit
7c4263d426
27
ggml-cuda.cu
27
ggml-cuda.cu
@ -544,14 +544,19 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
|
|||||||
|
|
||||||
#define QR3_XS 8
|
#define QR3_XS 8
|
||||||
#define QI3_XS (QK_K / (4*QR3_XS))
|
#define QI3_XS (QK_K / (4*QR3_XS))
|
||||||
|
#if QK_K == 64
|
||||||
|
#define IQ3S_N_SCALE 2
|
||||||
|
#else
|
||||||
|
#define IQ3S_N_SCALE QK_K/64
|
||||||
|
#endif
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d;
|
half d;
|
||||||
uint8_t qs[QK_K/4];
|
uint8_t qs[QK_K/4];
|
||||||
uint8_t qh[QK_K/32];
|
uint8_t qh[QK_K/32];
|
||||||
uint8_t signs[QK_K/8];
|
uint8_t signs[QK_K/8];
|
||||||
uint8_t scales[QK_K/64];
|
uint8_t scales[IQ3S_N_SCALE];
|
||||||
} block_iq3_s;
|
} block_iq3_s;
|
||||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 27*(QK_K/64), "wrong iq3_s block size/padding");
|
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
||||||
|
|
||||||
#define QR1_S 8
|
#define QR1_S 8
|
||||||
#define QI1_S (QK_K / (4*QR1_S))
|
#define QI1_S (QK_K / (4*QR1_S))
|
||||||
@ -571,6 +576,11 @@ typedef struct {
|
|||||||
} block_iq4_nl;
|
} block_iq4_nl;
|
||||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||||
|
|
||||||
|
#if QK_K == 64
|
||||||
|
#define block_iq4_xs block_iq4_nl
|
||||||
|
#define QR4_XS QR4_NL
|
||||||
|
#define QI4_XS QI4_NL
|
||||||
|
#else
|
||||||
// QR4_XS = 8 is very slightly faster than QR4_XS = 4
|
// QR4_XS = 8 is very slightly faster than QR4_XS = 4
|
||||||
#define QR4_XS 8
|
#define QR4_XS 8
|
||||||
#define QI4_XS (QK_K / (4*QR4_XS))
|
#define QI4_XS (QK_K / (4*QR4_XS))
|
||||||
@ -581,7 +591,7 @@ typedef struct {
|
|||||||
uint8_t qs[QK_K/2];
|
uint8_t qs[QK_K/2];
|
||||||
} block_iq4_xs;
|
} block_iq4_xs;
|
||||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||||
|
#endif
|
||||||
|
|
||||||
#define WARP_SIZE 32
|
#define WARP_SIZE 32
|
||||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||||
@ -2439,9 +2449,9 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if QK_K != 64
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
||||||
|
|
||||||
@ -2455,8 +2465,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
|||||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
@ -5382,8 +5392,7 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
|
|||||||
return 0.f;
|
return 0.f;
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
assert(false);
|
return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
|
||||||
return 0.f;
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -7444,7 +7453,11 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
|
|||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = (k + QK_K - 1) / QK_K;
|
const int nb = (k + QK_K - 1) / QK_K;
|
||||||
|
#if QK_K == 64
|
||||||
|
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||||
|
#else
|
||||||
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename src_t, typename dst_t>
|
template <typename src_t, typename dst_t>
|
||||||
|
@ -2560,12 +2560,16 @@ typedef struct {
|
|||||||
uint8_t qs[QK4_NL/2];
|
uint8_t qs[QK4_NL/2];
|
||||||
} block_iq4_nl;
|
} block_iq4_nl;
|
||||||
|
|
||||||
|
#if QK_K == 64
|
||||||
|
#define block_iq4_xs block_iq4_nl
|
||||||
|
#else
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d;
|
half d;
|
||||||
uint16_t scales_h;
|
uint16_t scales_h;
|
||||||
uint8_t scales_l[QK_K/64];
|
uint8_t scales_l[QK_K/64];
|
||||||
uint8_t qs[QK_K/2];
|
uint8_t qs[QK_K/2];
|
||||||
} block_iq4_xs;
|
} block_iq4_xs;
|
||||||
|
#endif
|
||||||
|
|
||||||
//====================================== dot products =========================
|
//====================================== dot products =========================
|
||||||
|
|
||||||
@ -4346,7 +4350,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int ix = tiisg;
|
const int ix = tiisg;
|
||||||
|
|
||||||
device const float * y4 = y + 32 * ix;
|
device const float * y4 = y + 32 * ix;
|
||||||
@ -4387,12 +4390,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|||||||
|
|
||||||
y4 += 32 * 32;
|
y4 += 32 * 32;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
(void) x;
|
|
||||||
(void) y;
|
|
||||||
(void) yl;
|
|
||||||
(void) nb32;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; ++row) {
|
for (int row = 0; row < N_DST; ++row) {
|
||||||
all_sum = simd_sum(sumf[row]);
|
all_sum = simd_sum(sumf[row]);
|
||||||
@ -4482,7 +4479,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int ix = tiisg;
|
const int ix = tiisg;
|
||||||
|
|
||||||
device const float * y4 = y + 32 * ix;
|
device const float * y4 = y + 32 * ix;
|
||||||
@ -4533,12 +4529,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|||||||
|
|
||||||
y4 += 32 * 32;
|
y4 += 32 * 32;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
(void) x;
|
|
||||||
(void) y;
|
|
||||||
(void) yl;
|
|
||||||
(void) nb32;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; ++row) {
|
for (int row = 0; row < N_DST; ++row) {
|
||||||
all_sum = simd_sum(sumf[row]);
|
all_sum = simd_sum(sumf[row]);
|
||||||
@ -4628,7 +4618,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int ix = tiisg;
|
const int ix = tiisg;
|
||||||
|
|
||||||
device const float * y4 = y + 32 * ix;
|
device const float * y4 = y + 32 * ix;
|
||||||
@ -4672,12 +4661,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|||||||
|
|
||||||
y4 += 32 * 32;
|
y4 += 32 * 32;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
(void) x;
|
|
||||||
(void) y;
|
|
||||||
(void) yl;
|
|
||||||
(void) nb32;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; ++row) {
|
for (int row = 0; row < N_DST; ++row) {
|
||||||
all_sum = simd_sum(sumf[row]);
|
all_sum = simd_sum(sumf[row]);
|
||||||
@ -5016,7 +4999,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|||||||
|
|
||||||
const int nb32 = nb * (QK_K / 32);
|
const int nb32 = nb * (QK_K / 32);
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int ix = tiisg/2;
|
const int ix = tiisg/2;
|
||||||
const int il = tiisg%2;
|
const int il = tiisg%2;
|
||||||
|
|
||||||
@ -5055,12 +5037,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|||||||
|
|
||||||
y4 += 16 * 32;
|
y4 += 16 * 32;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
(void) x;
|
|
||||||
(void) y;
|
|
||||||
(void) yl;
|
|
||||||
(void) nb32;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; ++row) {
|
for (int row = 0; row < N_DST; ++row) {
|
||||||
all_sum = simd_sum(sumf[row]);
|
all_sum = simd_sum(sumf[row]);
|
||||||
@ -5167,6 +5143,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if QK_K != 64
|
||||||
void kernel_mul_mv_iq4_xs_f32_impl(
|
void kernel_mul_mv_iq4_xs_f32_impl(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device const float * src1,
|
device const float * src1,
|
||||||
@ -5260,6 +5237,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
||||||
kernel void kernel_mul_mv_iq1_s_f32(
|
kernel void kernel_mul_mv_iq1_s_f32(
|
||||||
@ -5344,7 +5322,11 @@ kernel void kernel_mul_mv_iq4_xs_f32(
|
|||||||
uint tiisg[[thread_index_in_simdgroup]],
|
uint tiisg[[thread_index_in_simdgroup]],
|
||||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||||
|
|
||||||
|
#if QK_K == 64
|
||||||
|
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||||
|
#else
|
||||||
kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
//============================= templates and their specializations =============================
|
//============================= templates and their specializations =============================
|
||||||
@ -5770,6 +5752,9 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
|
|||||||
|
|
||||||
template <typename type4x4>
|
template <typename type4x4>
|
||||||
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
||||||
|
#if QK_K == 64
|
||||||
|
dequantize_iq4_nl(xb, il, reg);
|
||||||
|
#else
|
||||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||||
const int ib32 = il/2;
|
const int ib32 = il/2;
|
||||||
il = il%2;
|
il = il%2;
|
||||||
@ -5786,6 +5771,7 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
|
|||||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||||
@ -6334,7 +6320,11 @@ template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_r
|
|||||||
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||||
|
#if QK_K == 64
|
||||||
|
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, 2, dequantize_iq4_xs>;
|
||||||
|
#else
|
||||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
// matrix-matrix multiplication
|
// matrix-matrix multiplication
|
||||||
@ -6378,7 +6368,11 @@ template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_m
|
|||||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||||
|
#if QK_K == 64
|
||||||
|
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_xs>;
|
||||||
|
#else
|
||||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
// indirect matrix-matrix multiplication
|
// indirect matrix-matrix multiplication
|
||||||
@ -6434,7 +6428,11 @@ template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel
|
|||||||
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||||
|
#if QK_K == 64
|
||||||
|
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, 2, dequantize_iq4_xs>;
|
||||||
|
#else
|
||||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
// matrix-vector multiplication
|
// matrix-vector multiplication
|
||||||
@ -7707,7 +7705,11 @@ kernel void kernel_mul_mv_id_iq4_xs_f32(
|
|||||||
|
|
||||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||||
|
|
||||||
|
#if QK_K == 64
|
||||||
|
kernel_mul_mv_iq4_nl_f32_impl(
|
||||||
|
#else
|
||||||
kernel_mul_mv_iq4_xs_f32_impl(
|
kernel_mul_mv_iq4_xs_f32_impl(
|
||||||
|
#endif
|
||||||
src0[id],
|
src0[id],
|
||||||
(device const float *) (src1 + bid*nb11),
|
(device const float *) (src1 + bid*nb11),
|
||||||
dst + bid*ne0,
|
dst + bid*ne0,
|
||||||
|
148
ggml-quants.c
148
ggml-quants.c
@ -1877,7 +1877,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|||||||
float mins[QK_K/16];
|
float mins[QK_K/16];
|
||||||
float scales[QK_K/16];
|
float scales[QK_K/16];
|
||||||
float sw[QK_K/16];
|
float sw[QK_K/16];
|
||||||
float weight[QK_K/16];
|
float weight[16];
|
||||||
uint8_t Ls[QK_K/16], Lm[QK_K/16];
|
uint8_t Ls[QK_K/16], Lm[QK_K/16];
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
@ -1887,13 +1887,42 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|||||||
float sigma2 = sumx2/QK_K;
|
float sigma2 = sumx2/QK_K;
|
||||||
for (int j = 0; j < QK_K/16; ++j) {
|
for (int j = 0; j < QK_K/16; ++j) {
|
||||||
const float * restrict qw = quant_weights + QK_K * i + 16*j;
|
const float * restrict qw = quant_weights + QK_K * i + 16*j;
|
||||||
for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
||||||
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
||||||
scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
|
float dm, mm;
|
||||||
float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
|
#if QK_K == 64
|
||||||
|
float max_scale = 0, max_min = 0;
|
||||||
|
for (int j = 0; j < QK_K/16; ++j) {
|
||||||
|
max_scale = MAX(max_scale, scales[j]);
|
||||||
|
max_min = MAX(max_min, mins[j]);
|
||||||
|
}
|
||||||
|
dm = max_scale/15;
|
||||||
|
mm = max_min/15;
|
||||||
|
if (max_scale) {
|
||||||
|
float id = 1/dm;
|
||||||
|
for (int j = 0; j < QK_K/16; ++j) {
|
||||||
|
int l = nearest_int(id*scales[j]);
|
||||||
|
Ls[j] = MAX(0, MIN(15, l));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
memset(Ls, 0, QK_K/16);
|
||||||
|
}
|
||||||
|
if (max_min) {
|
||||||
|
float id = 1/mm;
|
||||||
|
for (int j = 0; j < QK_K/16; ++j) {
|
||||||
|
int l = nearest_int(id*mins[j]);
|
||||||
|
Lm[j] = MAX(0, MIN(15, l));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
memset(Lm, 0, QK_K/16);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
|
||||||
|
mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
|
||||||
|
#endif
|
||||||
y[i].d = GGML_FP32_TO_FP16(dm);
|
y[i].d = GGML_FP32_TO_FP16(dm);
|
||||||
y[i].dmin = GGML_FP32_TO_FP16(mm);
|
y[i].dmin = GGML_FP32_TO_FP16(mm);
|
||||||
dm = GGML_FP16_TO_FP32(y[i].d);
|
dm = GGML_FP16_TO_FP32(y[i].d);
|
||||||
@ -4227,6 +4256,9 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
|||||||
|
|
||||||
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
|
#if QK_K == 64
|
||||||
|
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
||||||
|
#else
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
@ -4246,6 +4278,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
|||||||
qs += 16;
|
qs += 16;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
//===================================== Q8_K ==============================================
|
//===================================== Q8_K ==============================================
|
||||||
@ -6306,7 +6339,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
|
|
||||||
float sumf = 0;
|
float sumf = 0;
|
||||||
|
|
||||||
int isum[4];
|
int isum[QK_K/16];
|
||||||
|
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
@ -6322,14 +6355,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||||
|
|
||||||
isum[0] = isum[1] = isum[2] = isum[3] = 0;
|
memset(isum, 0, (QK_K/16)*sizeof(int));
|
||||||
for (int l = 0; l < 16; ++l) {
|
for (int l = 0; l < 16; ++l) {
|
||||||
isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
|
isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
|
||||||
isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
|
isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
|
||||||
isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
|
isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
|
||||||
isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
|
isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
|
||||||
}
|
}
|
||||||
for (int l = 0; l < 4; ++l) {
|
for (int l = 0; l < QK_K/16; ++l) {
|
||||||
isum[l] *= (sc[l] & 0xF);
|
isum[l] *= (sc[l] & 0xF);
|
||||||
}
|
}
|
||||||
sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
|
sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
|
||||||
@ -9488,15 +9521,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|||||||
|
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
|
|
||||||
const __m128i m4 = _mm_set1_epi8(0xf);
|
|
||||||
const __m128i m1 = _mm_set1_epi8(1);
|
|
||||||
const __m256i m511 = _mm256_set1_epi16(511);
|
|
||||||
const __m256i mone = _mm256_set1_epi8(1);
|
const __m256i mone = _mm256_set1_epi8(1);
|
||||||
|
|
||||||
static const uint8_t k_bit_helper[32] = {
|
|
||||||
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
||||||
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
||||||
};
|
|
||||||
static const char block_sign_shuffle_mask_1[32] = {
|
static const char block_sign_shuffle_mask_1[32] = {
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
||||||
@ -9510,11 +9535,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|||||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
};
|
};
|
||||||
|
|
||||||
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
|
||||||
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
||||||
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
||||||
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
||||||
|
|
||||||
|
#if QK_K == 64
|
||||||
|
static const uint8_t k_bit_helper[16] = {
|
||||||
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||||
|
};
|
||||||
|
const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
|
||||||
|
const __m128i m511 = _mm_set1_epi16(511);
|
||||||
|
typedef union {
|
||||||
|
__m128i vec_index;
|
||||||
|
uint16_t index[8];
|
||||||
|
} index_t;
|
||||||
|
|
||||||
|
index_t idx;
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
|
||||||
|
idx.vec_index = _mm_and_si128(q2_data, m511);
|
||||||
|
|
||||||
|
const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
|
||||||
|
const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
|
||||||
|
const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
|
||||||
|
|
||||||
|
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
||||||
|
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
|
||||||
|
const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
|
||||||
|
|
||||||
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
|
||||||
|
|
||||||
|
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
|
||||||
|
iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
|
||||||
|
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
|
||||||
|
iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
|
||||||
|
|
||||||
|
__m256i signs;
|
||||||
|
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
|
||||||
|
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||||
|
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
|
||||||
|
|
||||||
|
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
|
||||||
|
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||||
|
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
|
||||||
|
|
||||||
|
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
||||||
|
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
||||||
|
|
||||||
|
const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
||||||
|
const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
||||||
|
|
||||||
|
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
|
||||||
|
|
||||||
|
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
#else
|
||||||
|
|
||||||
|
static const uint8_t k_bit_helper[32] = {
|
||||||
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||||
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||||
|
};
|
||||||
|
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
||||||
|
const __m256i m511 = _mm256_set1_epi16(511);
|
||||||
|
const __m128i m4 = _mm_set1_epi8(0xf);
|
||||||
|
const __m128i m1 = _mm_set1_epi8(1);
|
||||||
|
|
||||||
uint64_t aux64;
|
uint64_t aux64;
|
||||||
|
|
||||||
// somewhat hacky, but gives a significant boost in performance
|
// somewhat hacky, but gives a significant boost in performance
|
||||||
@ -9603,6 +9694,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|||||||
}
|
}
|
||||||
|
|
||||||
*s = 0.125f * hsum_float_8(accumf);
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@ -10199,7 +10291,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|||||||
|
|
||||||
const int nb = n / QK_K;
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
#if defined __ARM_NEON
|
// TODO: implement for QK_K = 64
|
||||||
|
#if defined __ARM_NEON && QK_K == 256
|
||||||
|
|
||||||
const uint8x16_t m8 = vdupq_n_u8(0x08);
|
const uint8x16_t m8 = vdupq_n_u8(0x08);
|
||||||
const uint8x16_t m7 = vdupq_n_u8(0x07);
|
const uint8x16_t m7 = vdupq_n_u8(0x07);
|
||||||
@ -10256,7 +10349,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
|
|
||||||
#elif defined __AVX2__
|
// TODO: implement for QK_K = 64
|
||||||
|
#elif defined __AVX2__ && QK_K == 256
|
||||||
|
|
||||||
const __m128i m8 = _mm_set1_epi8(0x08);
|
const __m128i m8 = _mm_set1_epi8(0x08);
|
||||||
const __m128i m7 = _mm_set1_epi8(0x07);
|
const __m128i m7 = _mm_set1_epi8(0x07);
|
||||||
@ -10455,6 +10549,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|||||||
UNUSED(by);
|
UNUSED(by);
|
||||||
UNUSED(bs);
|
UNUSED(bs);
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
#if QK_K == 64
|
||||||
|
ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
|
||||||
|
#else
|
||||||
|
|
||||||
const block_iq4_xs * restrict x = vx;
|
const block_iq4_xs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
@ -10574,6 +10671,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|||||||
}
|
}
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// ================================ IQ2 quantization =============================================
|
// ================================ IQ2 quantization =============================================
|
||||||
@ -10921,7 +11019,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|||||||
|
|
||||||
const int kMaxQ = 3;
|
const int kMaxQ = 3;
|
||||||
|
|
||||||
const int nbl = n/256;
|
const int nbl = n/QK_K;
|
||||||
|
|
||||||
block_iq2_xxs * y = vy;
|
block_iq2_xxs * y = vy;
|
||||||
|
|
||||||
@ -11094,7 +11192,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|||||||
|
|
||||||
const int kMaxQ = 3;
|
const int kMaxQ = 3;
|
||||||
|
|
||||||
const int nbl = n/256;
|
const int nbl = n/QK_K;
|
||||||
|
|
||||||
block_iq2_xs * y = vy;
|
block_iq2_xs * y = vy;
|
||||||
|
|
||||||
@ -12037,7 +12135,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|||||||
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
||||||
GGML_ASSERT(n%QK_K == 0);
|
GGML_ASSERT(n%QK_K == 0);
|
||||||
|
|
||||||
const int nbl = n/256;
|
const int nbl = n/QK_K;
|
||||||
|
|
||||||
block_iq1_s * y = vy;
|
block_iq1_s * y = vy;
|
||||||
|
|
||||||
@ -12315,6 +12413,9 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
|
|||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
|
#if QK_K == 64
|
||||||
|
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
||||||
|
#else
|
||||||
(void)hist;
|
(void)hist;
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
@ -12333,6 +12434,7 @@ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, i
|
|||||||
qrow += nblock*sizeof(block_iq4_xs);
|
qrow += nblock*sizeof(block_iq4_xs);
|
||||||
}
|
}
|
||||||
return nrow * nblock * sizeof(block_iq4_xs);
|
return nrow * nblock * sizeof(block_iq4_xs);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
||||||
@ -12363,7 +12465,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|||||||
|
|
||||||
const int kMaxQ = 3;
|
const int kMaxQ = 3;
|
||||||
|
|
||||||
const int nbl = n/256;
|
const int nbl = n/QK_K;
|
||||||
|
|
||||||
block_iq2_s * y = vy;
|
block_iq2_s * y = vy;
|
||||||
|
|
||||||
|
@ -230,6 +230,10 @@ typedef struct {
|
|||||||
} block_iq4_nl;
|
} block_iq4_nl;
|
||||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||||
|
|
||||||
|
#if QK_K == 64
|
||||||
|
#define block_iq4_xs block_iq4_nl
|
||||||
|
//typedef struct block_iq4_nl block_iq4_xs;
|
||||||
|
#else
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_fp16_t d;
|
ggml_fp16_t d;
|
||||||
uint16_t scales_h;
|
uint16_t scales_h;
|
||||||
@ -237,6 +241,7 @@ typedef struct {
|
|||||||
uint8_t qs[QK_K/2];
|
uint8_t qs[QK_K/2];
|
||||||
} block_iq4_xs;
|
} block_iq4_xs;
|
||||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
15
ggml.c
15
ggml.c
@ -728,14 +728,22 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||||||
},
|
},
|
||||||
[GGML_TYPE_IQ4_XS] = {
|
[GGML_TYPE_IQ4_XS] = {
|
||||||
.type_name = "iq4_xs",
|
.type_name = "iq4_xs",
|
||||||
|
#if QK_K == 64
|
||||||
|
.blck_size = QK4_NL,
|
||||||
|
#else
|
||||||
.blck_size = QK_K,
|
.blck_size = QK_K,
|
||||||
|
#endif
|
||||||
.type_size = sizeof(block_iq4_xs),
|
.type_size = sizeof(block_iq4_xs),
|
||||||
.is_quantized = true,
|
.is_quantized = true,
|
||||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
||||||
.from_float = quantize_row_iq4_xs,
|
.from_float = quantize_row_iq4_xs,
|
||||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
||||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||||
|
#if QK_K == 64
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
#else
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
#endif
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_K] = {
|
[GGML_TYPE_Q8_K] = {
|
||||||
@ -19830,6 +19838,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ4_NL:
|
case GGML_TYPE_IQ4_NL:
|
||||||
|
#if QK_K == 64
|
||||||
|
case GGML_TYPE_IQ4_XS:
|
||||||
|
#endif
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK4_NL == 0);
|
GGML_ASSERT(start % QK4_NL == 0);
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
@ -19838,15 +19849,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|||||||
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
|
#if QK_K != 64
|
||||||
case GGML_TYPE_IQ4_XS:
|
case GGML_TYPE_IQ4_XS:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK4_NL == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
|
#endif
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
size_t elemsize = sizeof(ggml_fp16_t);
|
size_t elemsize = sizeof(ggml_fp16_t);
|
||||||
|
Loading…
Reference in New Issue
Block a user