Consolidate code for mat x vec kernels and use subgroups more extensively.

Adam Treat 2023-09-29 10:02:22 -04:00 committed by cebtenzzre
parent 77135a3bf5
commit 93306f16d0
16 changed files with 321 additions and 214 deletions

View File

@@ -165,11 +165,20 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
if (heapSize < memoryRequired) if (heapSize < memoryRequired)
continue; continue;
vk::PhysicalDeviceSubgroupProperties subgroupProperties;
vk::PhysicalDeviceProperties2 deviceProperties2;
deviceProperties2.pNext = &subgroupProperties;
physicalDevices.at(i).getProperties2(&deviceProperties2);
if (subgroupProperties.subgroupSize < 32)
continue;
ggml_vk_device d; ggml_vk_device d;
d.index = i; d.index = i;
d.type = properties.deviceType; d.type = properties.deviceType;
d.heapSize = heapSize; d.heapSize = heapSize;
d.name = properties.deviceName; d.name = properties.deviceName;
d.subgroupSize = subgroupProperties.subgroupSize;
size_t n_idx = ++count_by_name[d.name]; size_t n_idx = ++count_by_name[d.name];
if (n_idx > 1) { if (n_idx > 1) {
d.name += " (" + std::to_string(n_idx) + ")"; d.name += " (" + std::to_string(n_idx) + ")";
@@ -242,7 +251,7 @@ bool ggml_vk_init_device(const ggml_vk_device &device) {
bool ggml_vk_init_device(int device) { bool ggml_vk_init_device(int device) {
komputeManager()->initializeDevice(device, {}, komputeManager()->initializeDevice(device, {},
{"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
"VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"}); "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
return ggml_vk_has_device(); return ggml_vk_has_device();
} }
@@ -772,9 +781,10 @@ void ggml_vk_soft_max(kp::Sequence& seq,
}; };
std::shared_ptr<kp::Algorithm> s_algo = nullptr; std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(__func__)) if (!komputeManager()->hasAlgorithm(__func__)) {
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
else { s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
} else {
s_algo = komputeManager()->getAlgorithm(__func__); s_algo = komputeManager()->getAlgorithm(__func__);
s_algo->setTensors({in, out}); s_algo->setTensors({in, out});
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
@@ -890,9 +900,10 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
}; };
std::shared_ptr<kp::Algorithm> s_algo = nullptr; std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(__func__)) if (!komputeManager()->hasAlgorithm(__func__)) {
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts}); const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
else { s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
} else {
s_algo = komputeManager()->getAlgorithm(__func__); s_algo = komputeManager()->getAlgorithm(__func__);
s_algo->setTensors({inA, inB, out}); s_algo->setTensors({inA, inB, out});
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
@@ -907,26 +918,28 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
const std::shared_ptr<kp::Tensor>& inB, const std::shared_ptr<kp::Tensor>& inB,
const std::shared_ptr<kp::Tensor>& out, const std::shared_ptr<kp::Tensor>& out,
uint32_t inAOff, uint32_t inBOff, uint32_t outOff, uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
int32_t ne01, int32_t ne11) { int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
struct PushConstants { struct PushConstants {
uint32_t inAOff, inBOff, outOff; uint32_t inAOff, inBOff, outOff;
int32_t ne00, ne10, ne0; int32_t ne00, ne10, ne0, ne1, ne01, gqa;
} pushConsts { } pushConsts {
safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
ne00, ne10, ne0, ne00, ne10, ne0, ne1, ne01, ne12/ne02
}; };
std::shared_ptr<kp::Algorithm> s_algo = nullptr; std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(__func__)) if (!komputeManager()->hasAlgorithm(__func__)) {
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts}); const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
else { s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
} else {
s_algo = komputeManager()->getAlgorithm(__func__); s_algo = komputeManager()->getAlgorithm(__func__);
s_algo->setTensors({inA, inB, out}); s_algo->setTensors({inA, inB, out});
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)}); s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
s_algo->setPushConstants<PushConstants>({pushConsts}); s_algo->setPushConstants<PushConstants>({pushConsts});
s_algo->updateDescriptors(s_kompute_context->pool.get()); s_algo->updateDescriptors(s_kompute_context->pool.get());
} }
seq.record<kp::OpTensorFill>({out});
seq.record<kp::OpAlgoDispatch>(s_algo); seq.record<kp::OpAlgoDispatch>(s_algo);
} }
@@ -1182,7 +1195,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
const uint32_t nb3 = dst ? dst->nb[3] : 0; const uint32_t nb3 = dst ? dst->nb[3] : 0;
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
// const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
const static std::shared_ptr<kp::Tensor> nullTensor = nullptr; const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
@@ -1263,30 +1276,46 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
} break; } break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
{ {
if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32) if (src1t != GGML_TYPE_F32) {
&& src1->type == GGML_TYPE_F32) { fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
} else if (src0->type == GGML_TYPE_Q4_0
&& src1->type == GGML_TYPE_F32) {
ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
} else if (src0->type == GGML_TYPE_Q4_1
&& src1->type == GGML_TYPE_F32) {
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
} else {
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
goto not_implemented; goto not_implemented;
} }
if (!ggml_is_transposed(src0)
&& !ggml_is_transposed(src1)
&& ne00%32 == 0
&& ne11 > 1) {
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
goto not_implemented;
} else {
switch (src0t) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
break;
case GGML_TYPE_Q4_0:
ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
break;
case GGML_TYPE_Q4_1:
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
break;
default: {
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
goto not_implemented;
}
}
}
} break; } break;
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
{ {
if (src0->type == GGML_TYPE_F16) { if (src0t == GGML_TYPE_F16) {
ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else if (src0->type == GGML_TYPE_Q4_0) { } else if (src0t == GGML_TYPE_Q4_0) {
ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else if (src0->type == GGML_TYPE_Q4_1) { } else if (src0t == GGML_TYPE_Q4_1) {
ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else { } else {
fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type); fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
goto not_implemented; goto not_implemented;
} }
} break; } break;
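
The hunk above now queries each device's subgroup (SIMD wave) size through the Vulkan physical-device properties chain, skips devices narrower than 32 lanes, and stores the value so the mat x vec paths can size their workgroups as local_x = subgroupSize * 2 via specialization constant 0 (the shaders declare layout(local_size_x_id = 0)). A minimal standalone sketch of that query, assuming a Vulkan 1.1 instance; the helper name pickSubgroupSize and the printed values are illustrative, not part of the patch:

    #include <vulkan/vulkan.hpp>
    #include <cstdint>
    #include <cstdio>

    // Returns the subgroup size of one physical device, mirroring the check
    // added in ggml_vk_available_devices() (devices below 32 lanes are skipped).
    static uint32_t pickSubgroupSize(const vk::PhysicalDevice &dev) {
        vk::PhysicalDeviceSubgroupProperties subgroupProperties;
        vk::PhysicalDeviceProperties2 deviceProperties2;
        deviceProperties2.pNext = &subgroupProperties;   // chain the subgroup query
        dev.getProperties2(&deviceProperties2);
        return subgroupProperties.subgroupSize;          // typically 32 or 64
    }

    int main() {
        vk::ApplicationInfo appInfo;
        appInfo.apiVersion = VK_API_VERSION_1_1;         // getProperties2 needs >= 1.1
        vk::InstanceCreateInfo createInfo;
        createInfo.pApplicationInfo = &appInfo;
        vk::Instance instance = vk::createInstance(createInfo);
        for (const vk::PhysicalDevice &dev : instance.enumeratePhysicalDevices()) {
            const uint32_t sg = pickSubgroupSize(dev);
            printf("subgroup size %u -> local_x %u\n", sg, sg * 2);  // two subgroups per workgroup
        }
        instance.destroy();
    }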

View File

@@ -34,6 +34,7 @@ struct ggml_vk_device {
size_t heapSize = 0; size_t heapSize = 0;
std::string name; std::string name;
std::string vendor; std::string vendor;
int subgroupSize = 0;
}; };
std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired); std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);

View File

@@ -43,7 +43,7 @@ void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based fro
const uint nb = k / qk; const uint nb = k / qk;
for (uint i = 0; i < nb; i++) { for (uint i = 0; i < nb; i++) {
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0); const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
const float16_t d = block.d; const float16_t d = block.d;
const float16_t m = block.m; const float16_t m = block.m;
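
One-character fix, but it matters: the loop strides q4_1 data, and a q4_1 block is larger than a q4_0 block (it carries an extra fp16 minimum m), so indexing with sizeof_block_q4_0 walks off the block boundaries after the first block. A hedged C++ sketch of the two layouts, assuming the usual ggml packing (32 weights per block, fp16 values stored here as uint16_t):

    #include <cstdint>

    // ggml-style 4-bit blocks: 32 weights, packed two per byte.
    struct block_q4_0 {            // 18 bytes: matches sizeof_block_q4_0
        uint16_t d;                // fp16 scale
        uint8_t  qs[32 / 2];       // 16 bytes of nibbles
    };
    struct block_q4_1 {            // 20 bytes: matches sizeof_block_q4_1
        uint16_t d;                // fp16 scale
        uint16_t m;                // fp16 minimum
        uint8_t  qs[32 / 2];
    };
    static_assert(sizeof(block_q4_0) == 18 && sizeof(block_q4_1) == 20,
                  "stride q4_1 rows with sizeof_block_q4_1, not sizeof_block_q4_0");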

View File

@@ -10,7 +10,9 @@
#include "common.comp" #include "common.comp"
layout(local_size_x = 64) in; #extension GL_KHR_shader_subgroup_arithmetic : require
layout(local_size_x_id = 0) in;
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; }; layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -29,8 +31,6 @@ layout (push_constant) uniform parameter {
int ne1; int ne1;
} pcs; } pcs;
shared float sum[gl_WorkGroupSize.x];
void main() { void main() {
const uint r0 = gl_WorkGroupID.x; const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y; const uint r1 = gl_WorkGroupID.y;
@@ -39,24 +39,13 @@ void main() {
const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
sum[gl_LocalInvocationID.x] = 0.0; float sumf = 0.0f;
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) { sumf += float(inA[x+i]) * float(inB[y+i]);
sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
} }
// accumulate the sum from all threads in the threadgroup const float all_sum = subgroupAdd(sumf);
barrier(); if (subgroupElect()) {
memoryBarrierShared(); out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
if (gl_LocalInvocationID.x < i) {
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
}
barrier();
memoryBarrierShared();
}
if (gl_LocalInvocationID.x == 0) {
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
} }
} }
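
op_mul_mat_f16.comp drops its shared-memory tree reduction: each invocation still accumulates a strided partial dot product of one fp16 row of A with the corresponding fp32 row of B, but the partials are now combined with subgroupAdd and a single lane (subgroupElect) writes the result, and the workgroup width comes from specialization constant 0 instead of a hard-coded 64. A scalar C++ reference of what one workgroup produces; the function name mul_mat_f16_ref and the float-typed A are illustrative (the shader reads float16_t):

    #include <cstdint>
    #include <vector>

    // Reference for one output element: out[im*ne1*ne0 + r1*ne0 + r0] =
    // dot(row r0 of A, row r1 of B) over the shared dimension ne00.
    float mul_mat_f16_ref(const std::vector<float> &A, const std::vector<float> &B,
                          uint32_t x /* start of the A row */,
                          uint32_t y /* start of the B row */,
                          uint32_t ne00) {
        float sumf = 0.0f;
        for (uint32_t i = 0; i < ne00; ++i)   // the shader strides this loop by
            sumf += A[x + i] * B[y + i];      // gl_SubgroupSize and subgroupAdd()s
        return sumf;                          // the per-lane partial sums
    }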

View File

@@ -10,7 +10,13 @@
#include "common.comp" #include "common.comp"
layout(local_size_x = 8, local_size_y = 8) in; #define BLOCKS_IN_QUANT QK4_0
#define SIZE_OF_BLOCK sizeof_block_q4_0
#define N_ROWS 4
layout(local_size_x_id = 0) in;
layout(local_size_y = 1) in;
layout(local_size_z = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; }; layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,58 +29,31 @@ layout (push_constant) uniform parameter {
int ne00; int ne00;
int ne10; int ne10;
int ne0; int ne0;
int ne1;
int ne01;
int gqa;
} pcs; } pcs;
shared float sum[64]; // The q4_0 version of this function
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
vec2 acc = vec2(0.0, 0.0);
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
float d = float(u8BufToFloat16(inA, index));
float sumy = 0.0f;
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
void main() { const float yl0 = inB[yb + i];
const uint nb = uint(pcs.ne00/QK4_0); const float yl1 = inB[yb + i + 1];
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
const uint r0 = gl_WorkGroupID.x; sumy += yl0 + yl1 + yl8 + yl9;
const uint r1 = gl_WorkGroupID.y;
const uint x = r0*nb; // Based from inA without base offset acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
const uint first = 4 * iy;
float sumf = 0.0;
for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
const float d = float(u8BufToFloat16(inA, index));
const uint xl = first; // Based from bl->qs
const uint yl = y + i * QK4_0 + first; // Based from inB
vec2 acc = vec2(0.0, 0.0);
for (int j = 0; j < 4; ++j) {
const uint8_t b = inA[index+2+xl+j];
acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
acc.y += inB[yl+j] + inB[yl+j+16];
}
sumf += d * (acc.x - 8.*acc.y);
}
sum[ith] = sumf;
//
// Accumulate the sum from all threads in the threadgroup
//
barrier();
if (ith == 0) {
float sumTotal = 0.0;
for (uint i = 0; i < nth; ++i) {
sumTotal += sum[i];
}
out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
} }
return d * (sumy * -8.f + acc[0] + acc[1]);
} }
#include "op_mul_mv_q_n.comp"

View File

@@ -10,7 +10,13 @@
#include "common.comp" #include "common.comp"
layout(local_size_x = 8, local_size_y = 8) in; #define BLOCKS_IN_QUANT QK4_1
#define SIZE_OF_BLOCK sizeof_block_q4_1
#define N_ROWS 4
layout(local_size_x_id = 0) in;
layout(local_size_y = 1) in;
layout(local_size_z = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; }; layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,81 +29,33 @@ layout (push_constant) uniform parameter {
int ne00; int ne00;
int ne10; int ne10;
int ne0; int ne0;
int ne1;
int ne01;
int gqa;
} pcs; } pcs;
shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y]; // The q4_1 version of this function
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
vec2 acc = vec2(0.0, 0.0);
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
float d = float(u8BufToFloat16(inA, index));
float m = float(u8BufToFloat16(inA, index+2));
#define UNALIGNED_INPUT inA float sumy = 0.0f;
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
block_q4_1 get_unaligned_block_q4_1(uint index) { const float yl0 = inB[yb + i];
block_q4_1 fres; const float yl1 = inB[yb + i + 1];
fres.d = u8BufToFloat16(UNALIGNED_INPUT, index); const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2); const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
fres.qs[it] = UNALIGNED_INPUT[index+4+it]; sumy += yl0 + yl1 + yl8 + yl9;
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
} }
return fres; return d * (acc[0] + acc[1]) + sumy * m;
} }
void main() { #include "op_mul_mv_q_n.comp"
const uint nb = uint(pcs.ne00/QK4_1);
const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y;
const uint x = r0*nb; // Based from inA without base offset
const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
const uint first = 4 * iy;
float sumf = 0.0;
for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
//TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
const float d = float(block.d);
const float m = float(block.m);
const uint xl = first; // Based from bl->qs
const uint yl = y + i * QK4_1 + first; // Based from inB
vec2 acc = vec2(0.0, 0.0);
for (int j = 0; j < 4; ++j) {
acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
}
sumf += d * (acc.x - acc.y);
}
sum[ith] = sumf;
//
// Accumulate the sum from all threads in the threadgroup
//
barrier();
memoryBarrierShared();
if (ith%4 == 0) {
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
}
barrier();
memoryBarrierShared();
if (ith%16 == 0) {
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
}
barrier();
memoryBarrierShared();
if (ith == 0) {
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
}
}
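
Same consolidation for q4_1, where a weight decodes to d * q + m, so the dot product factors into d * sum(q*y) + m * sum(y), which is exactly the shader's d * (acc[0] + acc[1]) + sumy * m. A matching scalar reference (q4_1_block_dot_ref is an illustrative name):

    #include <cstdint>

    // Reference dot product of one q4_1 block (scale d, minimum m) with 32 activations.
    float q4_1_block_dot_ref(float d, float m, const uint8_t qs[16], const float y[32]) {
        float acc = 0.0f, sumy = 0.0f;
        for (int j = 0; j < 16; ++j) {
            acc  += (qs[j] & 0x0F) * y[j] + (qs[j] >> 4) * y[j + 16];
            sumy += y[j] + y[j + 16];
        }
        return d * acc + m * sumy;
    }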

View File

@@ -0,0 +1,49 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_EXT_debug_printf : enable
void main() {
const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y;
const uint im = gl_WorkGroupID.z;
const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0);
const uint x = offset0; // Based from inA without base offset
const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
const uint ix = gl_SubgroupInvocationID/2;
const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
uint yb = y + ix * BLOCKS_IN_QUANT + il;
debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
for (int row = 0; row < N_ROWS; row++) {
const uint block_index = x + ib + row * nb;
sumf[row] += block_q_n_dot_y(block_index, yb, il);
}
yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2;
}
for (int row = 0; row < N_ROWS; ++row) {
const float tot = subgroupAdd(sumf[row]);
if (first_row + row < pcs.ne01 && subgroupElect()) {
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
}
}
}
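
The new shared body assigns N_ROWS (4) output rows to each subgroup, so a workgroup of local_x = 2 * subgroupSize (two subgroups) covers 8 rows, which is where the host side's (ne01 + 7)/8 workgroup count in ggml_vk_mul_mat_q4_x comes from. The debugPrintfEXT call left in here is presumably also why the device init now requests VK_KHR_shader_non_semantic_info. A small C++ sketch of the dispatch arithmetic; the concrete subgroupSize and ne01 values are assumptions for illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t subgroupSize = 32;                 // reported by the device
        const uint32_t local_x      = subgroupSize * 2;   // two subgroups per workgroup
        const uint32_t n_rows       = 4;                  // N_ROWS in the shader
        const uint32_t rows_per_wg  = (local_x / subgroupSize) * n_rows;   // = 8

        const uint32_t ne01 = 4096;                       // rows of the weight matrix
        const uint32_t groups_x = (ne01 + rows_per_wg - 1) / rows_per_wg;  // (ne01 + 7) / 8

        printf("dispatch %u x ne11 x ne12 workgroups of %u invocations\n",
               groups_x, local_x);
        return 0;
    }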

View File

@@ -10,9 +10,9 @@
#include "common.comp" #include "common.comp"
#define nth 32 #extension GL_KHR_shader_subgroup_arithmetic : require
layout(local_size_x = nth) in; layout(local_size_x_id = 0) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
@@ -25,8 +25,6 @@ layout(push_constant) uniform PushConstants {
int ne02; int ne02;
} pcs; } pcs;
shared float buf[nth];
void main() { void main() {
const uint i03 = gl_WorkGroupID.z; const uint i03 = gl_WorkGroupID.z;
const uint i02 = gl_WorkGroupID.y; const uint i02 = gl_WorkGroupID.y;
@@ -37,46 +35,22 @@ void main() {
const uint pdst = extra_off + pcs.outOff; // Based from out_ const uint pdst = extra_off + pcs.outOff; // Based from out_
// parallel max // parallel max
buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000); float localMax = uintBitsToFloat(0xFF800000);
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]); localMax = max(localMax, in_[psrc0 + i00]);
} }
float max_ = subgroupMax(localMax);
// reduce
barrier();
memoryBarrierShared();
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
if (gl_LocalInvocationID.x < i) {
buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
}
barrier();
memoryBarrierShared();
}
// broadcast
const float max_ = buf[0];
// parallel sum // parallel sum
buf[gl_LocalInvocationID.x] = 0.0; float localSum = 0.0f;
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_); const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
localSum += exp_psrc0;
out_[pdst + i00] = exp_psrc0;
} }
// reduce const float sum = subgroupAdd(localSum);
barrier(); for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
memoryBarrierShared(); out_[pdst + i00] /= sum;
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
if (gl_LocalInvocationID.x < i) {
buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
}
barrier();
memoryBarrierShared();
}
// broadcast
const float sum = buf[0];
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
} }
} }
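
The softmax shader keeps the usual numerically stable formulation (subtract the row max, exponentiate, normalize) but now finds the max with subgroupMax, sums with subgroupAdd, stores exp(x - max_) on the second pass and divides in place on the third, instead of staging everything through shared memory with barriers. A scalar C++ reference of the same three passes (softmax_row_ref is an illustrative name):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Reference for one row of the softmax the shader parallelizes per subgroup.
    void softmax_row_ref(const float *in, float *out, size_t ne00) {
        float max_ = -INFINITY;                    // shader seeds with uintBitsToFloat(0xFF800000)
        for (size_t i = 0; i < ne00; ++i) max_ = std::max(max_, in[i]);

        float sum = 0.0f;
        for (size_t i = 0; i < ne00; ++i) {        // pass 2: exponentiate and accumulate
            out[i] = std::exp(in[i] - max_);
            sum += out[i];
        }
        for (size_t i = 0; i < ne00; ++i)          // pass 3: normalize in place
            out[i] /= sum;
    }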

View File

@@ -13,6 +13,7 @@ add_library(kompute STATIC Algorithm.cpp
OpAlgoDispatch.cpp OpAlgoDispatch.cpp
OpMemoryBarrier.cpp OpMemoryBarrier.cpp
OpTensorCopy.cpp OpTensorCopy.cpp
OpTensorFill.cpp
OpTensorSyncDevice.cpp OpTensorSyncDevice.cpp
OpTensorSyncLocal.cpp OpTensorSyncLocal.cpp
OpBufferSyncDevice.cpp OpBufferSyncDevice.cpp

View File

@@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpTensorFill.hpp"
#include "kompute/Tensor.hpp"
namespace kp {
OpTensorFill::OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors)
{
KP_LOG_DEBUG("Kompute OpTensorFill constructor with params");
if (tensors.size() < 1) {
throw std::runtime_error(
"Kompute OpTensorFill called with less than 1 tensor");
}
this->mTensors = tensors;
}
OpTensorFill::~OpTensorFill()
{
KP_LOG_DEBUG("Kompute OpTensorFill destructor started");
}
void
OpTensorFill::record(const vk::CommandBuffer& commandBuffer)
{
KP_LOG_DEBUG("Kompute OpTensorFill record called");
for (size_t i = 0; i < this->mTensors.size(); i++) {
this->mTensors[i]->recordFill(commandBuffer, 0);
}
}
void
OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
KP_LOG_DEBUG("Kompute OpTensorFill preEval called");
}
void
OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
KP_LOG_DEBUG("Kompute OpTensorFill postEval called");
}
}
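
OpTensorFill is a thin Kompute operation that records vkCmdFillBuffer on each tensor's primary device buffer (see Tensor::recordFill below); the mat x vec path above records it on the output tensor right before the dispatch so the destination starts from a zeroed state. A hedged usage sketch, assuming the stock Kompute manager/sequence API used elsewhere in this vendored copy; the tensor contents are illustrative:

    #include <kompute/Kompute.hpp>
    #include <vector>

    int main() {
        kp::Manager mgr;
        auto t = mgr.tensor(std::vector<float>{1.f, 2.f, 3.f, 4.f});

        mgr.sequence()
            ->record<kp::OpTensorSyncDevice>({t})   // upload the initial values
            ->record<kp::OpTensorFill>({t})         // vkCmdFillBuffer(..., 0) on the device copy
            ->record<kp::OpTensorSyncLocal>({t})    // download the result
            ->eval();
        // t->vector() is now {0, 0, 0, 0}
        return 0;
    }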

View File

@@ -215,6 +215,13 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion); commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
} }
void
Tensor::recordFill(const vk::CommandBuffer &commandBuffer,
uint32_t fill)
{
commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill);
}
void void
Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
vk::AccessFlagBits srcAccessMask, vk::AccessFlagBits srcAccessMask,

View File

@@ -21,6 +21,7 @@ target_sources(kompute PRIVATE
kompute/operations/OpMemoryBarrier.hpp kompute/operations/OpMemoryBarrier.hpp
kompute/operations/OpMult.hpp kompute/operations/OpMult.hpp
kompute/operations/OpTensorCopy.hpp kompute/operations/OpTensorCopy.hpp
kompute/operations/OpTensorFill.hpp
kompute/operations/OpTensorSyncDevice.hpp kompute/operations/OpTensorSyncDevice.hpp
kompute/operations/OpTensorSyncLocal.hpp kompute/operations/OpTensorSyncLocal.hpp
kompute/operations/OpBufferSyncDevice.hpp kompute/operations/OpBufferSyncDevice.hpp

View File

@@ -15,6 +15,7 @@
#include "operations/OpTensorSyncLocal.hpp" #include "operations/OpTensorSyncLocal.hpp"
#include "operations/OpBufferSyncDevice.hpp" #include "operations/OpBufferSyncDevice.hpp"
#include "operations/OpBufferSyncLocal.hpp" #include "operations/OpBufferSyncLocal.hpp"
#include "operations/OpTensorFill.hpp"
// Will be build by CMake and placed inside the build directory // Will be build by CMake and placed inside the build directory
#include "ShaderLogisticRegression.hpp" #include "ShaderLogisticRegression.hpp"

View File

@@ -126,6 +126,9 @@ class Tensor
void recordCopyFrom(const vk::CommandBuffer& commandBuffer, void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
std::shared_ptr<Tensor> copyFromTensor); std::shared_ptr<Tensor> copyFromTensor);
void recordFill(const vk::CommandBuffer &commandBuffer,
uint32_t fill);
/** /**
* Records a copy from the internal staging memory to the device memory * Records a copy from the internal staging memory to the device memory
* using an optional barrier to wait for the operation. This function would * using an optional barrier to wait for the operation. This function would
@@ -279,6 +282,7 @@ class Tensor
vk::Buffer *bufferTo, vk::Buffer *bufferTo,
vk::DeviceSize bufferSize, vk::DeviceSize bufferSize,
vk::BufferCopy copyRegion); vk::BufferCopy copyRegion);
void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
const vk::Buffer& buffer, const vk::Buffer& buffer,
vk::AccessFlagBits srcAccessMask, vk::AccessFlagBits srcAccessMask,

View File

@@ -0,0 +1,58 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
* Operation that fills the tensor
*/
class OpTensorFill : public OpBase
{
public:
/**
* Default constructor with parameters that provides the core vulkan
* resources and the tensors that will be used in the operation.
*
* @param tensors Tensors that will be used to create in operation.
*/
OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors);
/**
* Default destructor. This class does not manage memory so it won't be
* expecting the parent to perform a release.
*/
~OpTensorFill() override;
/**
* Records the fill command for tensor.
*
* @param commandBuffer The command buffer to record the command into.
*/
void record(const vk::CommandBuffer& commandBuffer) override;
/**
* Does not perform any preEval commands.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
* Does not perform any postEval commands.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
private:
// -------------- ALWAYS OWNED RESOURCES
std::vector<std::shared_ptr<Tensor>> mTensors;
};
} // End namespace kp

View File

@@ -6495,7 +6495,8 @@ struct llama_context * llama_new_context_with_model(
if (ggml_vk_has_device() && params.n_gpu_layers > 0 if (ggml_vk_has_device() && params.n_gpu_layers > 0
&& (model->ftype == LLAMA_FTYPE_ALL_F32 && (model->ftype == LLAMA_FTYPE_ALL_F32
|| model->ftype == LLAMA_FTYPE_MOSTLY_F16 || model->ftype == LLAMA_FTYPE_MOSTLY_F16
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) { || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
// this allocates all Vulkan resources and memory buffers // this allocates all Vulkan resources and memory buffers
ctx->ctx_kompute = ggml_vk_init(); ctx->ctx_kompute = ggml_vk_init();