// quants kernels for ggml-cuda

// QK = number of values after dequantization
// QR = QK / number of values before dequantization
// QI = number of 32 bit integers before dequantization

#define QK4_0 32
#define QR4_0 2
#define QI4_0 4
typedef struct {
    half    d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");

#define QK4_1 32
#define QR4_1 2
#define QI4_1 4
typedef struct {
    half    d;              // delta
    half    m;              // min
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");

#define QK5_0 32
#define QR5_0 2
#define QI5_0 4
typedef struct {
    half    d;              // delta
    uint8_t qh[4];          // 5-th bit of quants
    uint8_t qs[QK5_0 / 2];  // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

#define QK5_1 32
#define QR5_1 2
#define QI5_1 4
typedef struct {
    half    d;              // delta
    half    m;              // min
    uint8_t qh[4];          // 5-th bit of quants
    uint8_t qs[QK5_1 / 2];  // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

#define QK8_0 32
#define QR8_0 1
#define QI8_0 8
typedef struct {
    half   d;               // delta
    int8_t qs[QK8_0];       // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

#define QK8_1 32
#define QR8_1 1
#define QI8_1 8
typedef struct {
    half   d;               // delta
    half   s;               // unquantized sum
    int8_t qs[QK8_0];       // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");

//================================= k-quants

#define QK_K 256

typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
    half d;                  // super-block scale for quantized scales
    half dmin;               // super-block scale for quantized mins
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

typedef struct {
    uint8_t hmask[QK_K/8];     // quants - high bit
    uint8_t qs[QK_K/4];        // quants, low 2 bits
    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
    half d;                    // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");

typedef struct {
    half d;                    // super-block scale for quantized scales
    half dmin;                 // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
    uint8_t qs[QK_K/2];        // 4-bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");

typedef struct {
    half d;                    // super-block scale for quantized scales
    half dmin;                 // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
    uint8_t qh[QK_K/8];        // quants, high bit
    uint8_t qs[QK_K/2];        // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");

typedef struct {
    uint8_t ql[QK_K/2];      // quants, lower 4 bits
    uint8_t qh[QK_K/4];      // quants, upper 2 bits
    int8_t  scales[QK_K/16]; // scales
    half    d;               // delta
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

template <typename dst_t, typename src1_t>
using dot_kernel_k_t = void (*)(const void * vx, const int ib, const int iqs, const src1_t * y, dst_t & v);
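// Illustrative sketch (not part of the original file): a host-side reference dequantizer for a
// single q4_0 block, mirroring the layout above and the arithmetic of dequantize_q4_0 below.
// Each qs byte stores two 4-bit quants; the low nibbles land in out[0..15], the high nibbles in
// out[16..31], and every value is (q - 8) * d. The function name is hypothetical.
static void dequantize_block_q4_0_ref(const block_q4_0 * b, float * out /* QK4_0 values */) {
    const float d = __half2float(b->d); // half -> float (host path of cuda_fp16.h)
    for (int j = 0; j < QK4_0/2; ++j) {
        const uint8_t q = b->qs[j];
        out[j +       0] = d * ((q & 0xF) - 8); // low nibble
        out[j + QK4_0/2] = d * ((q >>  4) - 8); // high nibble
    }
}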
template <typename dst_t>
using vec_dot_q_cuda_t = dst_t (*)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);

// TODO: f16
template <typename src_t>
static __global__ void quantize_q8_1(const src_t * x, void * vy, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    block_q8_1 * y = (block_q8_1 *) vy;

    const int ib = i / QK8_0;  // block index
    const int iqs = i % QK8_0; // quant index

    const float xi = x[i];
    float amax = fabsf(xi);
    float sum = xi;

#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
    }

    const float d = amax / 127;
    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);

    y[ib].qs[iqs] = q;

    if (iqs > 0) {
        return;
    }

    y[ib].d = d;
    y[ib].s = sum;
}

template <typename dst_t, typename vec2_t>
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, vec2_t & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const dst_t d = x[ib].d;

    const uint8_t vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

    const vec2_t off2 = make_vec2_t(8, 8);
    const vec2_t d2   = make_vec2_t(d, d);

    v = (v - off2) * d2;
}

template <typename dst_t, typename vec2_t>
static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, vec2_t & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const dst_t d = x[ib].d;
    const dst_t m = x[ib].m;

    const uint8_t vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

    const vec2_t d2 = make_vec2_t(d, d);
    const vec2_t m2 = make_vec2_t(m, m);

    v = v * d2 + m2;
}

template <typename dst_t, typename vec2_t>
static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, vec2_t & v){
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const dst_t d = x[ib].d;

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

    const vec2_t off2 = make_vec2_t(16, 16);
    const vec2_t d2   = make_vec2_t(d, d);

    v = (v - off2) * d2;
}

template <typename dst_t, typename vec2_t>
static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, vec2_t & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const dst_t d = x[ib].d;
    const dst_t m = x[ib].m;

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

    const vec2_t d2 = make_vec2_t(d, d);
    const vec2_t m2 = make_vec2_t(m, m);

    v = v * d2 + m2;
}

template <typename dst_t, typename vec2_t>
static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, vec2_t & v){
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const dst_t d = x[ib].d;

    v.x = x[ib].qs[iqs + 0];
    v.y = x[ib].qs[iqs + 1];

    const vec2_t d2 = make_vec2_t(d, d);

    v = v * d2;
}

//================================== k-quants

static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

    const int i   = blockIdx.x;
    const int tid = threadIdx.x;
    const int n   = tid/32;
    const int l   = tid - 32*n;
    const int is  = 8*n + l/16;

    const block_q2_K * x = (const block_q2_K *) vx;

    const uint8_t q = x[i].qs[32*n + l];
    float * y = yy + i*QK_K + 128*n;

    float dall = x[i].d;
    float dmin = x[i].dmin;
    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
}
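// Illustrative sketch (not part of the original file): host-side launch of quantize_q8_1.
// One thread quantizes one value and each warp covers exactly one QK8_1-sized block, so k is
// expected to be a multiple of 32 so that the __shfl_xor_sync reductions above run on full
// warps. The 256-thread block size and the wrapper name are assumptions for illustration.
template <typename src_t>
static void quantize_row_q8_1_cuda_sketch(const src_t * x, void * vy, const int k, cudaStream_t stream) {
    const int block_size = 256;                                // assumed block size
    const int num_blocks = (k + block_size - 1) / block_size;  // extra threads exit on the i >= k check
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, k);
}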
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

    const block_q2_K * x = (const block_q2_K *) vx;

    // if n is 0, we want to do the lower 128, else the upper 128,
    // covering y[l+0], y[l+32], y[l+64], y[l+96] and
    //          y[l+16], y[l+48], y[l+80], y[l+112]
    int n = iqs/128;      // 0 or 1
    int r = iqs - 128*n;  // 0...120 in steps of 8
    int l = r/8;          // 0...15 in steps of 1

    const float   * y = yy + 128*n + l;
    const uint8_t * q = x[ib].qs + 32*n + l;
    const uint8_t * s = x[ib].scales + 8*n;

    const float dall = x[ib].d;
    const float dmin = x[ib].dmin;

    float sum = y[  0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
              + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
              + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
              + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
              + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
              + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
              + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
              + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));

    result = sum;
}

static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {

    int r   = threadIdx.x/4;
    int i   = blockIdx.x;
    int tid = r/2;
    int is0 = r%2;
    int l0  = 16*is0 + 4*(threadIdx.x%4);
    int n   = tid / 4;
    int j   = tid - 4*n;

    const block_q3_K * x = (const block_q3_K *) vx;

    uint8_t m = 1 << (4*n + j);
    int is = 8*n + 2*j + is0;
    int shift = 2*j;

    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
    float d_all = x[i].d;
    float dl = d_all * (us - 32);

    float * y = yy + i*QK_K + 128*n + 32*j;
    const uint8_t * q = x[i].qs + 32*n;
    const uint8_t * hm = x[i].hmask;

    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
}

static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

    const block_q3_K * x = (const block_q3_K *) vx;

    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;

    uint32_t aux[3];
    uint32_t utmp[4];

    // if n is 0, we want to do the lower 128, else the upper 128,
    // covering y[l+0], y[l+32], y[l+64], y[l+96] and
    //          y[l+16], y[l+48], y[l+80], y[l+112]
    int n = iqs/128;      // 0 or 1
    int r = iqs - 128*n;  // 0...120 in steps of 8
    int l = r/8;          // 0...15 in steps of 1

    const float   * y  = yy + 128*n + l;
    const uint8_t * q  = x[ib].qs + 32*n + l;
    const uint8_t * hm = x[ib].hmask + l;
    const int8_t  * s  = (const int8_t *)utmp + 8*n;

    memcpy(aux, x[ib].scales, 12);
    utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
    utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
    utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
    utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);

    const float dall = x[ib].d;

    const uint8_t m = 1 << (4*n);

    float sum = y[  0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
              + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
              + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
              + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
              + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
              + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
              + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
              + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));

    result = sum * dall;
}
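// Illustrative sketch (not part of the original file): scalar reference for one q2_K
// super-block, mirroring the indexing used by dequantize_block_q2_K and vec_dot_q2_K above.
// Each of the 16 sub-blocks of 16 values has a 4-bit scale and a 4-bit min packed into one
// scales[] byte, and every value is dall * sc * q - dmin * min with a 2-bit quant q.
// The function name is hypothetical.
static void dequantize_block_q2_K_ref(const block_q2_K * b, float * out /* QK_K values */) {
    const float dall = __half2float(b->d);
    const float dmin = __half2float(b->dmin);
    for (int j = 0; j < QK_K; ++j) {
        const int is     = j / 16;               // sub-block index, 0..15
        const int shift  = 2 * ((j % 128) / 32); // which 2-bit field of the qs byte
        const int q      = (b->qs[32*(j/128) + j%32] >> shift) & 3;
        const uint8_t sc = b->scales[is];
        out[j] = dall * (sc & 0xF) * q - dmin * (sc >> 4);
    }
}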
// extract the 6-bit scale and the 6-bit min for sub-block j from the packed 12-byte scales of q4_K/q5_K
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j < 4) {
        d = q[j] & 63; m = q[j + 4] & 63;
    } else {
        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
}

static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
    const block_q4_K * x = (const block_q4_K *) vx;

    const int i = blockIdx.x;

    //// assume 64 threads - this is very slightly better than the one below
    //const int tid = threadIdx.x;
    //const int il  = tid/16;
    //const int ir  = tid%16;
    //const int is  = 2*il;
    //const int n   = 2;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int is  = 2*il;
    const int n   = 4;

    float * y = yy + i*QK_K + 64*il + n*ir;

    const float dall = x[i].d;
    const float dmin = x[i].dmin;

    const uint8_t * q = x[i].qs + 32*il + n*ir;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[i].scales, sc, m);
    const float d1 = dall * sc; const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[i].scales, sc, m);
    const float d2 = dall * sc; const float m2 = dmin * m;
    for (int l = 0; l < n; ++l) {
        y[l + 0] = d1 * (q[l] & 0xF) - m1;
        y[l +32] = d2 * (q[l] >>  4) - m2;
    }
}

static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

    const block_q4_K * x = (const block_q4_K *) vx;

    // iqs is in 0...248 in steps of 8 =>
    const int j  = iqs / 64;        // j  is in 0...3
    const int ir = (iqs - 64*j)/2;  // ir is in 0...28 in steps of 4
    const int is = 2*j;             // is is in 0...6 in steps of 2

    const float   * y = yy + 64*j + ir;
    const uint8_t * q = x[ib].qs + 32*j + ir;

    const float dall = x[ib].d;
    const float dmin = x[ib].dmin;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[ib].scales, sc, m);
    const float d1 = dall * sc;
    const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[ib].scales, sc, m);
    const float d2 = dall * sc;
    const float m2 = dmin * m;

    float sum = 0;
    for (int k = 0; k < 4; ++k) {
        sum += y[k +  0] * (d1 * (q[k] & 0xF) - m1);
        sum += y[k + 32] * (d2 * (q[k] >>  4) - m2);
    }
    result = sum;
}

static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
    const block_q5_K * x = (const block_q5_K *) vx;

    const int i = blockIdx.x;

    // assume 64 threads - this is very slightly better than the one below
    const int tid = threadIdx.x;
    const int il  = tid/16;  // il is in 0...3
    const int ir  = tid%16;  // ir is in 0...15
    const int is  = 2*il;    // is is in 0...6

    float * y = yy + i*QK_K + 64*il + 2*ir;

    const float dall = x[i].d;
    const float dmin = x[i].dmin;

    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
    const uint8_t * qh = x[i].qh + 2*ir;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[i].scales, sc, m);
    const float d1 = dall * sc; const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[i].scales, sc, m);
    const float d2 = dall * sc; const float m2 = dmin * m;

    uint8_t hm = 1 << (2*il);
    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
    hm <<= 1;
    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
}
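// Illustrative sketch (not part of the original file): a host-side launcher for
// dequantize_block_q5_K. Per the "assume 64 threads" comment above, one 64-thread block
// dequantizes one QK_K-sized super-block, so the grid is the number of super-blocks.
// The wrapper name and the assumption that k is a multiple of QK_K are illustrative.
static void dequantize_q5_K_cuda_sketch(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K; // number of super-blocks
    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
}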
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

    const block_q5_K * x = (const block_q5_K *) vx;

    // iqs is in 0...248 in steps of 8 =>
    const int j  = iqs / 64;        // j  is in 0...3
    const int ir = (iqs - 64*j)/2;  // ir is in 0...28 in steps of 4
    const int is = 2*j;             // is is in 0...6 in steps of 2

    const float   * y  = yy + 64*j + ir;
    const uint8_t * ql = x[ib].qs + 32*j + ir;
    const uint8_t * qh = x[ib].qh + ir;

    const float dall = x[ib].d;
    const float dmin = x[ib].dmin;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[ib].scales, sc, m);
    const float d1 = dall * sc;
    const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[ib].scales, sc, m);
    const float d2 = dall * sc;
    const float m2 = dmin * m;

    uint8_t hm = 1 << is;
    float sum = 0;
    for (int k = 0; k < 4; ++k) {
        sum += y[k +  0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
    }
    hm <<= 1;
    for (int k = 0; k < 4; ++k) {
        sum += y[k + 32] * (d2 * ((ql[k] >>  4) + (qh[k] & hm ? 16 : 0)) - m2);
    }
    result = sum;
}

template <typename dst_t>
static __global__ void dequantize_block_q6_K(const void * vx, dst_t * yy) {
    const block_q6_K * x = (const block_q6_K *) vx;

    const int i = blockIdx.x;

    // assume 64 threads - this is very slightly better than the one below
    const int tid = threadIdx.x;
    const int ip  = tid/32;       // ip is 0 or 1
    const int il  = tid - 32*ip;  // 0...31
    const int is  = 8*ip + il/16;

    // TODO: fp16 compute
    dst_t * y = yy + i*QK_K + 128*ip + il;

    const float d = x[i].d;

    const uint8_t * ql = x[i].ql + 64*ip + il;
    const uint8_t   qh = x[i].qh[32*ip + il];
    const int8_t  * sc = x[i].scales + is;

    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
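// Illustrative sketch (not part of the original file): scalar reference for one q6_K
// super-block, mirroring dequantize_block_q6_K above. Each value packs 4 low bits from ql
// and 2 high bits from qh, is re-centered by 32, and is scaled by d * scales[...].
// The function name is hypothetical.
static void dequantize_block_q6_K_ref(const block_q6_K * b, float * out /* QK_K values */) {
    const float d = __half2float(b->d);
    for (int j = 0; j < QK_K; ++j) {
        const int ip = j / 128;         // which 128-value half of the super-block
        const int r  = (j % 128) / 32;  // which of the 4 output rows within that half
        const int il = j % 32;          // position within the row
        const uint8_t ql = b->ql[64*ip + 32*(r % 2) + il];
        const int low4   = r < 2 ? (ql & 0xF) : (ql >> 4);
        const int high2  = (b->qh[32*ip + il] >> (2*r)) & 3;
        const int q      = (low4 | (high2 << 4)) - 32;
        out[j] = d * b->scales[8*ip + il/16 + 2*r] * q;
    }
}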
template <typename src1_t, typename dst_t>
static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const src1_t * yy, dst_t * dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q6_K * x = (const block_q6_K *)vx + ib0;

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8

    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7

#if K_QUANTS_PER_ITERATION == 1
    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
    const int is = 0;
#else
    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
    const int is = in / 4;
#endif

    const int ql_offset = 64*im + l0;
    const int qh_offset = 32*im + l0;
    const int s_offset  =  8*im + is;
    const int y_offset  = 128*im + l0;

    dst_t tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const src1_t  * y  = yy + i * QK_K + y_offset;
        const uint8_t * ql = x[i].ql + ql_offset;
        const uint8_t * qh = x[i].qh + qh_offset;
        const int8_t  * s  = x[i].scales + s_offset;

        const dst_t d = x[i].d;

#if K_QUANTS_PER_ITERATION == 1
        float sum = y[  0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
                  + y[ 16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
                  + y[ 32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
                  + y[ 48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
                  + y[ 64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
                  + y[ 80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
                  + y[ 96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
                  + y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
        tmp += sum;
#else
        dst_t sum = 0;
        for (int l = 0; l < 4; ++l) {
            sum += (dst_t)y[l+ 0] * (dst_t)s[0] * d * (dst_t)((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
                 + (dst_t)y[l+32] * (dst_t)s[2] * d * (dst_t)((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
                 + (dst_t)y[l+64] * (dst_t)s[4] * d * (dst_t)((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
                 + (dst_t)y[l+96] * (dst_t)s[6] * d * (dst_t)((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
        }
        tmp += sum;
#endif

    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

    if (tid == 0) {
        dst[row] = tmp;
    }
}
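// Illustrative sketch (not part of the original file): the butterfly reduction pattern used at
// the end of the mat-vec kernels in this file. __shfl_xor_sync exchanges values between lanes
// whose IDs differ in one bit, so after 5 steps every lane of the 32-wide warp holds the full
// sum and lane 0 can write the result. The helper name is hypothetical.
static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}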
template <typename dst_t, typename vec2_t, int qk, int qr, void (*dequantize_kernel)(const void *, const int, const int, vec2_t &)>
static __global__ void dequantize_block(const void * vx, dst_t * y, const int k) {
    const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

    if (i >= k) {
        return;
    }

    const int ib = i/qk;        // block index
    const int iqs = (i%qk)/qr;  // quant index
    const int iybs = i - i%qk;  // y block start index
    const int y_offset = qr == 1 ? 1 : qk/2;

    // dequantize
    vec2_t v;
    dequantize_kernel(vx, ib, iqs, v);

    y[iybs + iqs + 0]        = v.x;
    y[iybs + iqs + y_offset] = v.y;
}

template <typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

    int vi;
    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);

    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);

    // subtract 8 from each quantized value
    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);

    // SIMD dot product of quantized values
    int sumi = __dp4a(vi0, ui0, 0);
    sumi     = __dp4a(vi1, ui1, sumi);

    return sumi*d;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}

template <typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);

    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
    const float m = bq4_1->m;
    const float s = bq8_1->s;

    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
    const int vi1 = (vi >> 4) & 0x0F0F0F0F;

    // SIMD dot product of quantized values
    int sumi = __dp4a(vi0, ui0, 0);
    sumi     = __dp4a(vi1, ui1, sumi);

    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}

template <typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

    int qs;
    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);

    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);

    int vi0 = (qs >> 0) & 0x0F0F0F0F;   // lower 4 qs bits, still need qh0 as 5th bits
    vi0    |= (qh0 <<  4) & 0x00000010; // 1 ->  5
    vi0    |= (qh0 << 11) & 0x00001000; // 2 -> 13
    vi0    |= (qh0 << 18) & 0x00100000; // 3 -> 21
    vi0    |= (qh0 << 25) & 0x10000000; // 4 -> 29
    vi0     = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
    int sumi = __dp4a(vi0, ui0, 0);     // SIMD dot product of quantized values

    int vi1 = (qs >> 4) & 0x0F0F0F0F;   // upper 4 qs bits, still need qh1 as 5th bits
    vi1    |= (qh1 <<  4) & 0x00000010; // 1 ->  5
    vi1    |= (qh1 << 11) & 0x00001000; // 2 -> 13
    vi1    |= (qh1 << 18) & 0x00100000; // 3 -> 21
    vi1    |= (qh1 << 25) & 0x10000000; // 4 -> 29
    vi1     = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
    sumi    = __dp4a(vi1, ui1, sumi);   // SIMD dot product of quantized values

    return sumi*d;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
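// Illustrative sketch (not part of the original file): scalar equivalent of __dp4a as used by
// the vec_dot_*_q8_1 functions above. Four signed 8-bit lanes packed into each int are
// multiplied pairwise and accumulated, which is why the quants are first repacked (and offset,
// e.g. with __vsub4) into byte lanes. The helper name is hypothetical.
static int dp4a_ref(const int a, const int b, int acc) {
    const int8_t * va = (const int8_t *) &a;
    const int8_t * vb = (const int8_t *) &b;
    for (int l = 0; l < 4; ++l) {
        acc += va[l] * vb[l]; // per-byte product, accumulated into a 32-bit sum
    }
    return acc;
}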
template <typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);

    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
    const float m = bq5_1->m;
    const float s = bq8_1->s;

    int vi0 = (qs >> 0) & 0x0F0F0F0F;   // lower 4 qs bits, still need qh0 as 5th bits
    vi0    |= (qh0 <<  4) & 0x00000010; // 1 ->  5
    vi0    |= (qh0 << 11) & 0x00001000; // 2 -> 13
    vi0    |= (qh0 << 18) & 0x00100000; // 3 -> 21
    vi0    |= (qh0 << 25) & 0x10000000; // 4 -> 29
    int sumi = __dp4a(vi0, ui0, 0);     // SIMD dot product of quantized values

    int vi1 = (qs >> 4) & 0x0F0F0F0F;   // upper 4 qs bits, still need qh1 as 5th bits
    vi1    |= (qh1 <<  4) & 0x00000010; // 1 ->  5
    vi1    |= (qh1 << 11) & 0x00001000; // 2 -> 13
    vi1    |= (qh1 << 18) & 0x00100000; // 3 -> 21
    vi1    |= (qh1 << 25) & 0x10000000; // 4 -> 29
    sumi    = __dp4a(vi1, ui1, sumi);   // SIMD dot product of quantized values

    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}

template <typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

    int vi;
    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);

    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);

    // SIMD dot product of quantized values
    int sumi = __dp4a(vi, ui, 0);

    return sumi*d;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}

template <typename dst_t, int qk, int qi, typename block_q_t, vec_dot_q_cuda_t<dst_t> vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * vx, const void * vy, dst_t * dst, const int ncols, const int nrows) {
    const int row = blockIdx.y*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = WARP_SIZE / qi;

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index

        const int iby = i + threadIdx.x / qi; // y block index

        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int

        tmp += (float)vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

    if (threadIdx.x == 0) {
        dst[row] = (dst_t)tmp;
    }
}
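// Illustrative sketch (not part of the original file): launching mul_mat_vec_q for a q4_0 x
// q8_1 mat-vec product with float output. One 32-thread warp handles one row and blockDim.y
// is kept at 1 for simplicity; the wrapper name, the launch geometry, and the template
// argument order (which follows the reconstructed declaration above) are assumptions.
static void mul_mat_vec_q4_0_q8_1_cuda_sketch(const void * vx, const void * vy, float * dst,
                                              const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_nums(1, nrows, 1);
    const dim3 block_dims(WARP_SIZE, 1, 1); // one warp per row
    mul_mat_vec_q<float, QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1<float>>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}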
template <typename src1_t, typename dst_t, typename vec2_t, int qk, int qr, void (*dequantize_kernel)(const void *, const int, const int, vec2_t &)>
static __global__ void dequantize_mul_mat_vec(const void * vx, const src1_t * y, dst_t * dst, const int ncols, const int nrows) {
    // qk = quantized weights per x block
    // qr = number of quantized weights per data value in x block
    const int row = blockIdx.y*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
    }

    const int tid = threadIdx.x;

    const int iter_stride = 2*GGML_CUDA_DMMV_X;
    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
    const int y_offset = qr == 1 ? 1 : qk/2;

    vec2_t tmp2 = make_vec2_t(0, 0); // partial sum for thread in warp

    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
        const int ib = (row*ncols + col)/qk; // x block index
        const int iqs = (col%qk)/qr;         // x quant index
        const int iybs = col - col%qk;       // y block start index

        // processing >2 values per i iter is faster for fast GPUs
#pragma unroll
        for (int j = 0; j < vals_per_iter; j += 2) {
            // process 2 vals per j iter

            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val

            // dequantize
            vec2_t xc;
            dequantize_kernel(vx, ib, iqs + j/qr, xc);

            // matrix multiplication
            vec2_t yc = make_vec2_t(
                y[iybs + iqs + j/qr + 0],
                y[iybs + iqs + j/qr + y_offset]);
            tmp2 += xc * yc;
        }
    }

    // sum up partial sums and write back result
    // TODO: reducing as half2 may be faster, but requires special handling for float2
    dst_t tmp = tmp2.x + tmp2.y;
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

    if (tid == 0) {
        dst[row] = tmp;
    }
}

template <typename src1_t, typename dst_t, int n_thread, dot_kernel_k_t<dst_t, src1_t> dot_kernel>
static __global__ void dequantize_mul_mat_vec_k(const void * vx, const src1_t * y, dst_t * dst, const int ncols) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    const int iter_stride = QK_K;
    const int vals_per_iter = iter_stride / n_thread;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    dst_t tmp = 0; // partial sum for thread in warp

    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
        const int ib = ib0 + col/QK_K;   // x block index
        const int iqs = col%QK_K;        // x quant index
        const int iybs = col - col%QK_K; // y block start index

        dst_t v;
        dot_kernel(vx, ib, iqs, y + iybs, v);
        tmp += v;
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

    if (tid == 0) {
        dst[row] = tmp;
    }
}
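// Illustrative sketch (not part of the original file): launching dequantize_mul_mat_vec_k with
// the q2_K dot kernel. With n_thread = 32, each thread handles an 8-value slice per super-block
// (QK_K/32 = 8), which matches the "iqs in steps of 8" convention of the vec_dot_*_K functions;
// one 32-thread block handles one row. The wrapper name, types, and geometry are assumptions.
static void dequantize_mul_mat_vec_q2_K_cuda_sketch(const void * vx, const float * y, float * dst,
                                                    const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_dims(32, 1, 1);
    dequantize_mul_mat_vec_k<float, float, 32, vec_dot_q2_K><<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
}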