mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-05 00:04:36 +00:00
Refactor getrows to use common code and get ready for q6_k.
This commit is contained in:
parent
5509f74318
commit
4b223ec432
@ -16,27 +16,12 @@
|
|||||||
#extension GL_EXT_debug_printf : enable
|
#extension GL_EXT_debug_printf : enable
|
||||||
|
|
||||||
#define QK4_0 32
|
#define QK4_0 32
|
||||||
#define QR4_0 2
|
|
||||||
#define QK4_1 32
|
#define QK4_1 32
|
||||||
|
|
||||||
#define GELU_COEF_A 0.044715
|
#define GELU_COEF_A 0.044715
|
||||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
|
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
|
||||||
|
|
||||||
#ifndef QK_K
|
|
||||||
#define QK_K 256
|
#define QK_K 256
|
||||||
#endif
|
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
#define K_SCALE_SIZE 12
|
|
||||||
#else
|
|
||||||
#define K_SCALE_SIZE 4
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define BM 128
|
|
||||||
#define BN 128
|
|
||||||
#define BK 8
|
|
||||||
#define TM 8
|
|
||||||
#define TN 8
|
|
||||||
|
|
||||||
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
||||||
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
||||||
@ -44,83 +29,76 @@
|
|||||||
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
|
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
|
||||||
|
|
||||||
#define sizeof_block_q4_0 0x12
|
#define sizeof_block_q4_0 0x12
|
||||||
#define sizeof_block_q4_1 0x14
|
|
||||||
struct block_q4_0 {
|
struct block_q4_0 {
|
||||||
float16_t d;
|
float16_t d;
|
||||||
uint8_t qs[QK4_0 / 2];
|
uint8_t qs[QK4_0 / 2];
|
||||||
};
|
};
|
||||||
|
mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
|
||||||
|
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||||
|
const float d2 = d1 / 256.f;
|
||||||
|
const float md = -8.f * xb.d;
|
||||||
|
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||||
|
const uint16_t mask1 = mask0 << 8;
|
||||||
|
|
||||||
|
mat4 reg;
|
||||||
|
for (int i=0;i<8;i++) {
|
||||||
|
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||||
|
reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
|
||||||
|
reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
|
||||||
|
}
|
||||||
|
return reg;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define sizeof_block_q4_1 0x14
|
||||||
struct block_q4_1 {
|
struct block_q4_1 {
|
||||||
float16_t d;
|
float16_t d;
|
||||||
float16_t m;
|
float16_t m;
|
||||||
uint8_t qs[QK4_1 / 2];
|
uint8_t qs[QK4_1 / 2];
|
||||||
};
|
};
|
||||||
|
mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
|
||||||
|
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||||
|
const float d2 = d1 / 256.f;
|
||||||
|
const float m = xb.m;
|
||||||
|
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||||
|
const uint16_t mask1 = mask0 << 8;
|
||||||
|
|
||||||
#ifndef QK_K
|
mat4 reg;
|
||||||
#define QK_K 256
|
for (int i=0;i<8;i++) {
|
||||||
#endif
|
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||||
|
reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
|
||||||
|
reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
|
||||||
|
}
|
||||||
|
return reg;
|
||||||
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
#define sizeof_block_q6_k 210
|
||||||
#define K_SCALE_SIZE 12
|
struct block_q6_k {
|
||||||
#else
|
|
||||||
#define K_SCALE_SIZE 4
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct block_q2_K {
|
|
||||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
|
||||||
uint8_t qs[QK_K/4]; // quants
|
|
||||||
float16_t d; // super-block scale for quantized scales
|
|
||||||
float16_t dmin; // super-block scale for quantized mins
|
|
||||||
};
|
|
||||||
// 84 bytes / block
|
|
||||||
|
|
||||||
struct block_q3_K {
|
|
||||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
|
||||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
|
||||||
#if QK_K == 64
|
|
||||||
uint8_t scales[2];
|
|
||||||
#else
|
|
||||||
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
|
||||||
#endif
|
|
||||||
float16_t d; // super-block scale
|
|
||||||
};
|
|
||||||
|
|
||||||
#if QK_K == 64
|
|
||||||
typedef struct {
|
|
||||||
float16_t d[2]; // super-block scales/mins
|
|
||||||
uint8_t scales[2];
|
|
||||||
uint8_t qs[QK_K/2]; // 4-bit quants
|
|
||||||
} block_q4_K;
|
|
||||||
#else
|
|
||||||
struct block_q4_K {
|
|
||||||
float16_t d; // super-block scale for quantized scales
|
|
||||||
float16_t dmin; // super-block scale for quantized mins
|
|
||||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
|
||||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if QK_K == 64
|
|
||||||
struct block_q5_K {
|
|
||||||
float16_t d; // super-block scales/mins
|
|
||||||
int8_t scales[QK_K/16]; // 8-bit block scales
|
|
||||||
uint8_t qh[QK_K/8]; // quants, high bit
|
|
||||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
|
||||||
};
|
|
||||||
#else
|
|
||||||
struct block_q5_K {
|
|
||||||
float16_t d; // super-block scale for quantized scales
|
|
||||||
float16_t dmin; // super-block scale for quantized mins
|
|
||||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
|
||||||
uint8_t qh[QK_K/8]; // quants, high bit
|
|
||||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
|
||||||
};
|
|
||||||
// 176 bytes / block
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct block_q6_K {
|
|
||||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||||
float16_t d; // super-block scale
|
float16_t d; // super-block scale
|
||||||
};
|
};
|
||||||
// 210 bytes / block
|
mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
|
||||||
|
const float16_t d_all = xb.d;
|
||||||
|
uint8_t ql[QK_K/2];
|
||||||
|
uint8_t qh[QK_K/4];
|
||||||
|
int8_t scales[QK_K/16];
|
||||||
|
|
||||||
|
const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||||
|
const uint qhIndex = 32*(il/8) + 16*(il&1);
|
||||||
|
float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
|
||||||
|
il = (il/2) & 3;
|
||||||
|
|
||||||
|
const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
|
||||||
|
const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
|
||||||
|
const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
|
||||||
|
const float16_t ml = float16_t(d_all * sc * 32.f);
|
||||||
|
const float16_t dl = float16_t(d_all * sc * coef);
|
||||||
|
mat4 reg;
|
||||||
|
for (int i = 0; i < 16; ++i) {
|
||||||
|
const float16_t q = (il&1) != 0 ? ((ql[qlIndex + i] & kmask2) | ((qh[qhIndex + i] & kmask1) << 2))
|
||||||
|
: ((ql[qlIndex + i] & kmask2) | ((qh[qhIndex + i] & kmask1) << 4));
|
||||||
|
reg[i/4][i%4] = dl * q - ml;
|
||||||
|
}
|
||||||
|
return reg;
|
||||||
|
}
|
||||||
|
25
kompute/op_getrows.comp
Normal file
25
kompute/op_getrows.comp
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/**
|
||||||
|
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
|
||||||
|
*
|
||||||
|
* This software is licensed under the terms of the Software for Open Models License (SOM),
|
||||||
|
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
|
||||||
|
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
|
||||||
|
*/
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
const uint i = gl_WorkGroupID.x;
|
||||||
|
const int r = inB[i + pcs.inBOff];
|
||||||
|
|
||||||
|
int z = 0;
|
||||||
|
for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
|
||||||
|
const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
|
||||||
|
const mat4 result = dequantize_block(inIndex, ind%NL);
|
||||||
|
for (uint j = 0; j < 4; ++j) {
|
||||||
|
for (uint k = 0; k < 4; ++k) {
|
||||||
|
const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
|
||||||
|
out_[outIndex] = result[j][k];
|
||||||
|
++z;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -25,11 +25,15 @@ layout (push_constant) uniform parameter {
|
|||||||
int nb1;
|
int nb1;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
|
void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||||
|
for (int j = 0; j < k; j++) {
|
||||||
|
out_[y + j] = inA[x + j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint i = gl_WorkGroupID.x;
|
const uint i = gl_WorkGroupID.x;
|
||||||
const int r = inB[i + pcs.inBOff];
|
const int r = inB[i + pcs.inBOff];
|
||||||
|
|
||||||
for (int j = 0; j < pcs.ne00; j++) {
|
dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1 + pcs.outOff, pcs.ne00);
|
||||||
out_[i*pcs.nb1 + j + pcs.outOff] = inA[r*pcs.nb01/2+j + pcs.inAOff];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -10,6 +10,10 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
|
#define NL 2
|
||||||
|
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||||
|
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||||
|
|
||||||
layout(local_size_x = 1) in;
|
layout(local_size_x = 1) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||||
@ -25,40 +29,18 @@ layout (push_constant) uniform parameter {
|
|||||||
int nb1;
|
int nb1;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
#define UNALIGNED_INPUT inA
|
|
||||||
|
|
||||||
block_q4_0 get_unaligned_block_q4_0(uint index) {
|
block_q4_0 get_unaligned_block_q4_0(uint index) {
|
||||||
block_q4_0 fres;
|
block_q4_0 fres;
|
||||||
fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
|
fres.d = u8BufToFloat16(inA, index);
|
||||||
[[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
|
[[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
|
||||||
fres.qs[it] = UNALIGNED_INPUT[index+2+it];
|
fres.qs[it] = inA[index+2+it];
|
||||||
}
|
}
|
||||||
return fres;
|
return fres;
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize_row_q4_0(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
mat4 dequantize_block(uint index, uint il) {
|
||||||
const uint qk = QK4_0;
|
const block_q4_0 block = get_unaligned_block_q4_0(index);
|
||||||
|
return dequantize_q4_0(block, il);
|
||||||
const uint nb = k / qk;
|
|
||||||
|
|
||||||
for (uint i = 0; i < nb; i++) {
|
|
||||||
const block_q4_0 block = get_unaligned_block_q4_0(x + i*sizeof_block_q4_0);
|
|
||||||
|
|
||||||
const float16_t d = block.d;
|
|
||||||
|
|
||||||
for (uint j = 0; j < qk/2; ++j) {
|
|
||||||
const int x0 = (block.qs[j] & 0x0F) - 8;
|
|
||||||
const int x1 = (block.qs[j] >> 4) - 8;
|
|
||||||
|
|
||||||
out_[y+i*qk + j + 0 ] = float(x0)*d;
|
|
||||||
out_[y+i*qk + j + qk/2] = float(x1)*d;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void main() {
|
#include "op_getrows.comp"
|
||||||
const uint i = gl_WorkGroupID.x;
|
|
||||||
const int r = inB[i + pcs.inBOff];
|
|
||||||
|
|
||||||
dequantize_row_q4_0(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
|
|
||||||
}
|
|
||||||
|
@ -10,6 +10,10 @@
|
|||||||
|
|
||||||
#include "common.comp"
|
#include "common.comp"
|
||||||
|
|
||||||
|
#define NL 2
|
||||||
|
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||||
|
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||||
|
|
||||||
layout(local_size_x = 1) in;
|
layout(local_size_x = 1) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||||
@ -25,42 +29,19 @@ layout (push_constant) uniform parameter {
|
|||||||
int nb1;
|
int nb1;
|
||||||
} pcs;
|
} pcs;
|
||||||
|
|
||||||
#define UNALIGNED_INPUT inA
|
|
||||||
|
|
||||||
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
||||||
block_q4_1 fres;
|
block_q4_1 fres;
|
||||||
fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
|
fres.d = u8BufToFloat16(inA, index);
|
||||||
fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
|
fres.m = u8BufToFloat16(inA, index+2);
|
||||||
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
||||||
fres.qs[it] = UNALIGNED_INPUT[index+4+it];
|
fres.qs[it] = inA[index+4+it];
|
||||||
}
|
}
|
||||||
return fres;
|
return fres;
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
mat4 dequantize_block(uint index, uint il) {
|
||||||
const uint qk = QK4_1;
|
const block_q4_1 block = get_unaligned_block_q4_1(index);
|
||||||
|
return dequantize_q4_1(block, il);
|
||||||
const uint nb = k / qk;
|
|
||||||
|
|
||||||
for (uint i = 0; i < nb; i++) {
|
|
||||||
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
|
|
||||||
|
|
||||||
const float16_t d = block.d;
|
|
||||||
const float16_t m = block.m;
|
|
||||||
|
|
||||||
for (uint j = 0; j < qk/2; ++j) {
|
|
||||||
const int x0 = (block.qs[j] & 0x0F);
|
|
||||||
const int x1 = (block.qs[j] >> 4);
|
|
||||||
|
|
||||||
out_[y+i*qk + j + 0 ] = float(x0)*d + m;
|
|
||||||
out_[y+i*qk + j + qk/2] = float(x1)*d + m;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void main() {
|
#include "op_getrows.comp"
|
||||||
const uint i = gl_WorkGroupID.x;
|
|
||||||
const int r = inB[i + pcs.inBOff];
|
|
||||||
|
|
||||||
dequantize_row_q4_1(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user