diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index c64fde832..74dd0f00f 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -165,11 +165,20 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
         if (heapSize < memoryRequired)
             continue;
 
+        vk::PhysicalDeviceSubgroupProperties subgroupProperties;
+        vk::PhysicalDeviceProperties2 deviceProperties2;
+        deviceProperties2.pNext = &subgroupProperties;
+        physicalDevices.at(i).getProperties2(&deviceProperties2);
+
+        if (subgroupProperties.subgroupSize < 32)
+            continue;
+
         ggml_vk_device d;
         d.index = i;
         d.type = properties.deviceType;
         d.heapSize = heapSize;
         d.name = properties.deviceName;
+        d.subgroupSize = subgroupProperties.subgroupSize;
         size_t n_idx = ++count_by_name[d.name];
         if (n_idx > 1) {
             d.name += " (" + std::to_string(n_idx) + ")";
@@ -242,7 +251,7 @@ bool ggml_vk_init_device(const ggml_vk_device &device) {
 bool ggml_vk_init_device(int device) {
     komputeManager()->initializeDevice(device, {},
                                        {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
-                                        "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
+                                        "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
     return ggml_vk_has_device();
 }
 
@@ -772,9 +781,10 @@ void ggml_vk_soft_max(kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
@@ -890,9 +900,10 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
@@ -907,26 +918,28 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
                           const std::shared_ptr<kp::Tensor>& inB,
                           const std::shared_ptr<kp::Tensor>& out,
                           uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                          int32_t ne00, int32_t ne10, int32_t ne0,
-                          int32_t ne01, int32_t ne11) {
+                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0;
+        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0,
+        ne00, ne10, ne0, ne1, ne01, ne12/ne02
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
+        s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
         s_algo->setPushConstants({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
 
+    seq.record<kp::OpTensorFill>({out});
     seq.record(s_algo);
 }
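A note on the dispatch changes above: the `{local_x}` value handed to `komputeManager()->algorithm(...)` is a Vulkan specialization constant with `constant_id` 0, which the reworked shaders pick up through `layout(local_size_x_id = 0) in;` instead of a hard-coded workgroup width. The sketch below shows roughly what that wiring looks like at the raw Vulkan level; Kompute performs the equivalent internally when it builds the pipeline, and the helper name here is made up.

```cpp
#include <vulkan/vulkan.h>
#include <cstdint>

// Illustrative only: how a host-chosen local_x reaches layout(local_size_x_id = 0).
void fill_local_x_spec(uint32_t subgroupSize,
                       uint32_t & local_x,                  // must outlive pipeline creation
                       VkSpecializationMapEntry & entry,
                       VkSpecializationInfo & spec) {
    local_x = subgroupSize * 2;              // same choice as ggml_vk_current_device().subgroupSize * 2

    entry.constantID = 0;                    // matches local_size_x_id = 0 in the shaders
    entry.offset     = 0;
    entry.size       = sizeof(uint32_t);

    spec.mapEntryCount = 1;
    spec.pMapEntries   = &entry;
    spec.dataSize      = sizeof(uint32_t);
    spec.pData         = &local_x;
    // `spec` is then referenced from VkPipelineShaderStageCreateInfo::pSpecializationInfo
    // when the compute pipeline for the shader module is created.
}
```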
@@ -1182,7 +1195,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
         const uint32_t nb3 = dst ? dst->nb[3] : 0;
 
         const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-//      const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+        const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
         const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
 
         const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
@@ -1263,30 +1276,46 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 } break;
             case GGML_OP_MUL_MAT:
                 {
-                    if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32)
-                        && src1->type == GGML_TYPE_F32) {
-                        ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
-                    } else if (src0->type == GGML_TYPE_Q4_0
-                        && src1->type == GGML_TYPE_F32) {
-                        ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
-                    } else if (src0->type == GGML_TYPE_Q4_1
-                        && src1->type == GGML_TYPE_F32) {
-                        ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
-                    } else {
-                        fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
+                    if (src1t != GGML_TYPE_F32) {
+                        fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                         goto not_implemented;
                     }
+
+                    if (!ggml_is_transposed(src0)
+                        && !ggml_is_transposed(src1)
+                        && ne00%32 == 0
+                        && ne11 > 1) {
+                        fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                        goto not_implemented;
+                    } else {
+                        switch (src0t) {
+                            case GGML_TYPE_F16:
+                            case GGML_TYPE_F32:
+                                ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                                break;
+                            case GGML_TYPE_Q4_0:
+                                ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
+                            case GGML_TYPE_Q4_1:
+                                ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
+                            default: {
+                                fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                                goto not_implemented;
+                            }
+                        }
+                    }
                 } break;
             case GGML_OP_GET_ROWS:
                 {
-                    if (src0->type == GGML_TYPE_F16) {
+                    if (src0t == GGML_TYPE_F16) {
                         ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
-                    } else if (src0->type == GGML_TYPE_Q4_0) {
+                    } else if (src0t == GGML_TYPE_Q4_0) {
                         ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
-                    } else if (src0->type == GGML_TYPE_Q4_1) {
+                    } else if (src0t == GGML_TYPE_Q4_1) {
                         ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                     } else {
-                        fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type);
+                        fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
                         goto not_implemented;
                     }
                 } break;
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index 614959ba8..7989cfc1f 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -34,6 +34,7 @@ struct ggml_vk_device {
     size_t heapSize = 0;
     std::string name;
     std::string vendor;
+    int subgroupSize = 0;
 };
 
 std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
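The new `subgroupSize` field travels with the rest of the device description, so callers that already enumerate devices can log or filter on it without issuing their own Vulkan queries. A minimal sketch using only the declarations from this header; the zero memory budget and the logging are illustrative, not part of the patch.

```cpp
#include <cstdio>
#include "ggml-vulkan.h"

// Illustrative only: list the devices that survive the heap/subgroup filtering and init the first one.
bool init_first_vk_device() {
    std::vector<ggml_vk_device> devices = ggml_vk_available_devices(/*memoryRequired =*/ 0);
    for (const ggml_vk_device & d : devices) {
        fprintf(stderr, "vulkan: %s, heap %zu MiB, subgroup size %d\n",
                d.name.c_str(), d.heapSize / (1024 * 1024), d.subgroupSize);
    }
    if (devices.empty()) {
        return false;
    }
    return ggml_vk_init_device(devices.front());
}
```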
diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp
index 44718c6af..3d00928d3 100644
--- a/kompute/op_getrows_q4_1.comp
+++ b/kompute/op_getrows_q4_1.comp
@@ -43,7 +43,7 @@ void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based fro
     const uint nb = k / qk;
 
     for (uint i = 0; i < nb; i++) {
-        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0);
+        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
 
         const float16_t d = block.d;
         const float16_t m = block.m;
diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp
index 1390c00cf..72a667f92 100644
--- a/kompute/op_mul_mat_f16.comp
+++ b/kompute/op_mul_mat_f16.comp
@@ -10,7 +10,9 @@
 
 #include "common.comp"
 
-layout(local_size_x = 64) in;
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(local_size_x_id = 0) in;
 
 layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -29,8 +31,6 @@ layout (push_constant) uniform parameter {
     int ne1;
 } pcs;
 
-shared float sum[gl_WorkGroupSize.x];
-
 void main() {
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
@@ -39,24 +39,13 @@ void main() {
     const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
     const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
 
-    sum[gl_LocalInvocationID.x] = 0.0;
-
-    for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
-        sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
+    float sumf = 0.0f;
+    for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
+        sumf += float(inA[x+i]) * float(inB[y+i]);
     }
 
-    // accumulate the sum from all threads in the threadgroup
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
-        }
-        barrier();
-        memoryBarrierShared();
-    }
-
-    if (gl_LocalInvocationID.x == 0) {
-        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
+    const float all_sum = subgroupAdd(sumf);
+    if (subgroupElect()) {
+        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
     }
 }
diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp
index 9b6dd72dc..165df3c37 100644
--- a/kompute/op_mul_mat_q4_0.comp
+++ b/kompute/op_mul_mat_q4_0.comp
@@ -10,7 +10,13 @@
 
 #include "common.comp"
 
-layout(local_size_x = 8, local_size_y = 8) in;
+#define BLOCKS_IN_QUANT QK4_0
+#define SIZE_OF_BLOCK sizeof_block_q4_0
+#define N_ROWS 4
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y = 1) in;
+layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,58 +29,31 @@ layout (push_constant) uniform parameter {
     int ne00;
     int ne10;
     int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
 } pcs;
 
-shared float sum[64];
+// The q4_0 version of this function
+float block_q_n_dot_y(uint block_index, uint yb, uint il) {
+    vec2 acc = vec2(0.0, 0.0);
+    const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
+    float d = float(u8BufToFloat16(inA, index));
+    float sumy = 0.0f;
+    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
+        const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
 
-void main() {
-    const uint nb = uint(pcs.ne00/QK4_0);
+        const float yl0 = inB[yb + i];
+        const float yl1 = inB[yb + i + 1];
+        const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
+        const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
 
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
+        sumy += yl0 + yl1 + yl8 + yl9;
 
-    const uint x = r0*nb; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
-
-    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
-    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
-
-    const uint ix = gl_LocalInvocationID.y/4;     // 0 or 1
-    const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
-
-    const uint first = 4 * iy;
-
-    float sumf = 0.0;
-
-    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
-        const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
-        const float d = float(u8BufToFloat16(inA, index));
-
-        const uint xl = first; // Based from bl->qs
-        const uint yl = y + i * QK4_0 + first; // Based from inB
-
-        vec2 acc = vec2(0.0, 0.0);
-
-        for (int j = 0; j < 4; ++j) {
-            const uint8_t b = inA[index+2+xl+j];
-            acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
-            acc.y += inB[yl+j] + inB[yl+j+16];
-        }
-
-        sumf += d * (acc.x - 8.*acc.y);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    barrier();
-    if (ith == 0) {
-        float sumTotal = 0.0;
-        for (uint i = 0; i < nth; ++i) {
-            sumTotal += sum[i];
-        }
-        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
+        acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
+        acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
     }
+    return d * (sumy * -8.f + acc[0] + acc[1]);
 }
+
+#include "op_mul_mv_q_n.comp"
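The nibble arithmetic in `block_q_n_dot_y` is easier to check against a plain scalar version of the same dot product. The sketch below is illustrative only and assumes the usual q4_0 layout from common.comp: a 2-byte fp16 scale `d` followed by 16 bytes packing 32 4-bit quants, low nibbles mapping to elements 0..15 and high nibbles to elements 16..31, each dequantized as `d * (q - 8)`. Expressions like `yl1 / 256.f * (b & 0x0F00)` in the shader are the same nibble extractions performed on a 16-bit load; `fp16_to_fp32` stands in for whatever half-float helper is available.

```cpp
#include <cstdint>

float fp16_to_fp32(uint16_t h);  // assumed helper, not part of the patch

// Illustrative scalar reference: one q4_0 block dotted with 32 consecutive y values.
float q4_0_block_dot(const uint8_t * block, const float * y) {
    const uint16_t d_bits = uint16_t(block[0]) | (uint16_t(block[1]) << 8);
    const float d = fp16_to_fp32(d_bits);

    float sum = 0.0f;
    for (int j = 0; j < 16; ++j) {
        const uint8_t q = block[2 + j];
        sum += y[j]      * float((q & 0x0F) - 8);  // low nibble  -> element j
        sum += y[j + 16] * float((q >>   4) - 8);  // high nibble -> element j + 16
    }
    return d * sum;  // equals d * (sumy * -8 + acc[0] + acc[1]) in the shader
}
```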
diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp
index fb7b051b8..683b695ca 100644
--- a/kompute/op_mul_mat_q4_1.comp
+++ b/kompute/op_mul_mat_q4_1.comp
@@ -10,7 +10,13 @@
 
 #include "common.comp"
 
-layout(local_size_x = 8, local_size_y = 8) in;
+#define BLOCKS_IN_QUANT QK4_1
+#define SIZE_OF_BLOCK sizeof_block_q4_1
+#define N_ROWS 4
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y = 1) in;
+layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,81 +29,33 @@ layout (push_constant) uniform parameter {
     int ne00;
     int ne10;
     int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
 } pcs;
 
-shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];
+// The q4_1 version of this function
+float block_q_n_dot_y(uint block_index, uint yb, uint il) {
+    vec2 acc = vec2(0.0, 0.0);
+    const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
+    float d = float(u8BufToFloat16(inA, index));
+    float m = float(u8BufToFloat16(inA, index+2));
 
-#define UNALIGNED_INPUT inA
+    float sumy = 0.0f;
+    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
+        const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
 
-block_q4_1 get_unaligned_block_q4_1(uint index) {
-    block_q4_1 fres;
-    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
-    fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
-    [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
-        fres.qs[it] = UNALIGNED_INPUT[index+4+it];
+        const float yl0 = inB[yb + i];
+        const float yl1 = inB[yb + i + 1];
+        const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
+        const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
+
+        sumy += yl0 + yl1 + yl8 + yl9;
+
+        acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
+        acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
     }
-    return fres;
+    return d * (acc[0] + acc[1]) + sumy * m;
 }
 
-void main() {
-    const uint nb = uint(pcs.ne00/QK4_1);
-
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-
-    const uint x = r0*nb; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
-
-    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
-    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
-
-    const uint ix = gl_LocalInvocationID.y/4;     // 0 or 1
-    const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
-
-    const uint first = 4 * iy;
-
-    float sumf = 0.0;
-
-    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
-        //TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
-
-        const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
-
-        const float d = float(block.d);
-        const float m = float(block.m);
-
-        const uint xl = first; // Based from bl->qs
-        const uint yl = y + i * QK4_1 + first; // Based from inB
-
-        vec2 acc = vec2(0.0, 0.0);
-
-        for (int j = 0; j < 4; ++j) {
-            acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
-            acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
-        }
-
-        sumf += d * (acc.x - acc.y);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    barrier();
-    memoryBarrierShared();
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    barrier();
-    memoryBarrierShared();
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    barrier();
-    memoryBarrierShared();
-    if (ith == 0) {
-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
-        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
-    }
-}
+#include "op_mul_mv_q_n.comp"
diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp
new file mode 100644
index 000000000..83de952dd
--- /dev/null
+++ b/kompute/op_mul_mv_q_n.comp
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+void main() {
+    const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint im = gl_WorkGroupID.z;
+
+    const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
+    const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0);
+
+    const uint x = offset0; // Based from inA without base offset
+    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+    const uint ix = gl_SubgroupInvocationID/2;
+    const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
+
+    uint yb = y + ix * BLOCKS_IN_QUANT + il;
+
+    debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
+                   gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
+                   gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
+
+    for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
+        for (int row = 0; row < N_ROWS; row++) {
+            const uint block_index = x + ib + row * nb;
+            sumf[row] += block_q_n_dot_y(block_index, yb, il);
+        }
+
+        yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2;
+    }
+
+    for (int row = 0; row < N_ROWS; ++row) {
+        const float tot = subgroupAdd(sumf[row]);
+        if (first_row + row < pcs.ne01 && subgroupElect()) {
+            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
+        }
+    }
+}
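The host dispatch in ggml-vulkan.cpp and this kernel have to agree on how many rows one workgroup covers: `local_x = subgroupSize * 2` gives two subgroups per workgroup, each subgroup accumulates `N_ROWS = 4` rows, so a workgroup handles 8 rows and the grid along x shrinks to `(ne01 + 7)/8`. Rows past `ne01` in the last workgroup are skipped by the `first_row + row < pcs.ne01` guard. A small sketch of that arithmetic; the function name is made up.

```cpp
#include <cstdint>

// Illustrative only: the bookkeeping behind {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}.
uint32_t q4_mul_mat_groups_x(int32_t ne01, uint32_t subgroupSize) {
    const uint32_t local_x            = subgroupSize * 2;          // specialization constant 0
    const uint32_t subgroups_per_wg   = local_x / subgroupSize;    // == 2
    const uint32_t rows_per_workgroup = subgroups_per_wg * 4;      // N_ROWS == 4, so 8 rows
    return (uint32_t(ne01) + rows_per_workgroup - 1) / rows_per_workgroup;
}
```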
diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
index e936d8f68..60456a3bb 100644
--- a/kompute/op_softmax.comp
+++ b/kompute/op_softmax.comp
@@ -10,9 +10,9 @@
 
 #include "common.comp"
 
-#define nth 32
+#extension GL_KHR_shader_subgroup_arithmetic : require
 
-layout(local_size_x = nth) in;
+layout(local_size_x_id = 0) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
@@ -25,8 +25,6 @@ layout(push_constant) uniform PushConstants {
     int ne02;
 } pcs;
 
-shared float buf[nth];
-
 void main() {
     const uint i03 = gl_WorkGroupID.z;
     const uint i02 = gl_WorkGroupID.y;
@@ -37,46 +35,22 @@ void main() {
     const uint pdst = extra_off + pcs.outOff; // Based from out_
 
     // parallel max
-    buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
+    float localMax = uintBitsToFloat(0xFF800000);
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        localMax = max(localMax, in_[psrc0 + i00]);
     }
-
-    // reduce
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
-        }
-        barrier();
-        memoryBarrierShared();
-    }
-
-    // broadcast
-    const float max_ = buf[0];
+    float max_ = subgroupMax(localMax);
 
     // parallel sum
-    buf[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
+    float localSum = 0.0f;
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
+        localSum += exp_psrc0;
+        out_[pdst + i00] = exp_psrc0;
     }
 
-    // reduce
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
-        }
-        barrier();
-        memoryBarrierShared();
-    }
-
-    // broadcast
-    const float sum = buf[0];
-
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
+    const float sum = subgroupAdd(localSum);
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        out_[pdst + i00] /= sum;
     }
 }
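Functionally this is still the usual numerically stable softmax; the notable change besides the subgroup reductions is that the second loop now writes `exp(x - max)` straight into `out_` while accumulating the sum, so the last pass only divides in place instead of recomputing the exponentials. A scalar reference of what one row ends up holding, illustrative only.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar equivalent of one row of the reworked shader.
void softmax_row_reference(const float * in, float * out, int64_t n) {
    float max_ = -INFINITY;                   // same as uintBitsToFloat(0xFF800000)
    for (int64_t i = 0; i < n; ++i) {
        max_ = std::max(max_, in[i]);
    }
    float sum = 0.0f;
    for (int64_t i = 0; i < n; ++i) {
        out[i] = std::exp(in[i] - max_);      // stored now, normalized below
        sum += out[i];
    }
    for (int64_t i = 0; i < n; ++i) {
        out[i] /= sum;
    }
}
```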
diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
index b5c3879af..42b7d07f5 100644
--- a/kompute/src/CMakeLists.txt
+++ b/kompute/src/CMakeLists.txt
@@ -13,6 +13,7 @@ add_library(kompute STATIC Algorithm.cpp
     OpAlgoDispatch.cpp
     OpMemoryBarrier.cpp
     OpTensorCopy.cpp
+    OpTensorFill.cpp
    OpTensorSyncDevice.cpp
    OpTensorSyncLocal.cpp
    OpBufferSyncDevice.cpp
diff --git a/kompute/src/OpTensorFill.cpp b/kompute/src/OpTensorFill.cpp
new file mode 100644
index 000000000..da477dcc7
--- /dev/null
+++ b/kompute/src/OpTensorFill.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpTensorFill.hpp"
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+OpTensorFill::OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill constructor with params");
+
+    if (tensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpTensorFill called with less than 1 tensor");
+    }
+
+    this->mTensors = tensors;
+}
+
+OpTensorFill::~OpTensorFill()
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill destructor started");
+}
+
+void
+OpTensorFill::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill record called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        this->mTensors[i]->recordFill(commandBuffer, 0);
+    }
+}
+
+void
+OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill preEval called");
+}
+
+void
+OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill postEval called");
+}
+
+}
diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp
index 9c343ff13..65279206d 100644
--- a/kompute/src/Tensor.cpp
+++ b/kompute/src/Tensor.cpp
@@ -215,6 +215,13 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
     commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
 }
 
+void
+Tensor::recordFill(const vk::CommandBuffer &commandBuffer,
+                   uint32_t fill)
+{
+    commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill);
+}
+
 void
 Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                          vk::AccessFlagBits srcAccessMask,
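`Tensor::recordFill` is a thin wrapper over `vkCmdFillBuffer`, and `OpTensorFill` simply records it for every tensor it was constructed with. A usage sketch in the usual Kompute record-and-eval style; illustrative only, assuming the new header is pulled in via Kompute.hpp as in the hunks below.

```cpp
#include <memory>
#include "kompute/Kompute.hpp"

// Illustrative only: zero a device tensor before a kernel that writes it sparsely.
void zero_tensor(kp::Manager & mgr, std::shared_ptr<kp::Tensor> tensor) {
    mgr.sequence()
       ->record<kp::OpTensorFill>({ tensor })  // records fillBuffer(..., 0) via Tensor::recordFill
       ->eval();
}
```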
diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt
index 313f48311..53e9d8ae6 100644
--- a/kompute/src/include/CMakeLists.txt
+++ b/kompute/src/include/CMakeLists.txt
@@ -21,6 +21,7 @@ target_sources(kompute PRIVATE
     kompute/operations/OpMemoryBarrier.hpp
     kompute/operations/OpMult.hpp
     kompute/operations/OpTensorCopy.hpp
+    kompute/operations/OpTensorFill.hpp
     kompute/operations/OpTensorSyncDevice.hpp
     kompute/operations/OpTensorSyncLocal.hpp
     kompute/operations/OpBufferSyncDevice.hpp
diff --git a/kompute/src/include/kompute/Kompute.hpp b/kompute/src/include/kompute/Kompute.hpp
index f59a63b50..70e0dd433 100644
--- a/kompute/src/include/kompute/Kompute.hpp
+++ b/kompute/src/include/kompute/Kompute.hpp
@@ -15,6 +15,7 @@
 #include "operations/OpTensorSyncLocal.hpp"
 #include "operations/OpBufferSyncDevice.hpp"
 #include "operations/OpBufferSyncLocal.hpp"
+#include "operations/OpTensorFill.hpp"
 
 // Will be build by CMake and placed inside the build directory
 #include "ShaderLogisticRegression.hpp"
diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp
index 4c260ce6b..2ab88eb30 100644
--- a/kompute/src/include/kompute/Tensor.hpp
+++ b/kompute/src/include/kompute/Tensor.hpp
@@ -126,6 +126,9 @@ class Tensor
     void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
                         std::shared_ptr<Tensor> copyFromTensor);
 
+    void recordFill(const vk::CommandBuffer &commandBuffer,
+                    uint32_t fill);
+
     /**
      * Records a copy from the internal staging memory to the device memory
      * using an optional barrier to wait for the operation. This function would
@@ -279,6 +282,7 @@ class Tensor
                           vk::Buffer *bufferTo,
                           vk::DeviceSize bufferSize,
                           vk::BufferCopy copyRegion);
+
     void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                    const vk::Buffer& buffer,
                                    vk::AccessFlagBits srcAccessMask,
diff --git a/kompute/src/include/kompute/operations/OpTensorFill.hpp b/kompute/src/include/kompute/operations/OpTensorFill.hpp
new file mode 100644
index 000000000..9a6bf131e
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpTensorFill.hpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that fills the tensor
+ */
+class OpTensorFill : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation.
+     *
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorFill() override;
+
+    /**
+     * Records the fill command for tensor.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
diff --git a/llama.cpp b/llama.cpp
index 1432696bd..245174898 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6495,7 +6495,8 @@ struct llama_context * llama_new_context_with_model(
     if (ggml_vk_has_device() && params.n_gpu_layers > 0
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) {
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
         // this allocates all Vulkan resources and memory buffers
         ctx->ctx_kompute = ggml_vk_init();