diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf4042ea3..fbbb46bbf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -483,6 +483,7 @@ if (LLAMA_KOMPUTE)
         kompute/op_mul_mat_mat_f32.comp
         kompute/op_mul_mat_mat_q4_0.comp
         kompute/op_mul_mat_mat_q8_0.comp
+        kompute/op_mul_mat_mat_q6_k.comp
         kompute/op_mul_mat_f16.comp
         kompute/op_mul_mat_q8_0.comp
         kompute/op_mul_mat_q4_0.comp
@@ -517,6 +518,7 @@ if (LLAMA_KOMPUTE)
         shaderop_mul_mat_mat_f32.h
         shaderop_mul_mat_mat_q4_0.h
         shaderop_mul_mat_mat_q8_0.h
+        shaderop_mul_mat_mat_q6_k.h
         shaderop_mul_mat_f16.h
         shaderop_mul_mat_q8_0.h
         shaderop_mul_mat_q4_0.h
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index f2320f3cc..488683ec3 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -31,6 +31,7 @@
 #include "shaderop_mul_mat_mat_f16.h"
 #include "shaderop_mul_mat_mat_q4_0.h"
 #include "shaderop_mul_mat_mat_q8_0.h"
+#include "shaderop_mul_mat_mat_q6_k.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
@@ -1109,6 +1110,54 @@ void ggml_vk_mul_mat_mat_q8_0(
     seq.record(s_algo);
 }
 
+void ggml_vk_mul_mat_mat_q6_k(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    uint32_t nb01, uint32_t nb02,
+    int32_t ne11, int32_t ne12,
+    uint32_t nb11, uint32_t nb12,
+    uint32_t nb1, uint32_t nb2) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q6_k_comp_spv,
+        kp::shader_data::op_mul_mat_mat_q6_k_comp_spv_len);
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne01, ne02, ne11, ne12;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        uint32_t nb1, nb2;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02, ne11, ne12,
+        nb01, nb02, nb11, nb12,
+        nb1, nb2
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(),
+            {inA, inB, out}, spirv,
+            {unsigned(ne01)/32,
+             unsigned(ne11),
+             unsigned(std::max(ne12, ne02))
+            },
+            {},
+            {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01)/32,
+                              unsigned(ne11),
+                              unsigned(std::max(ne12, ne02)),
+                             });
+        s_algo->setPushConstants({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record(s_algo);
+}
 
 void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
                               kp::Sequence& seq,
@@ -1138,7 +1187,7 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
     if (!komputeManager()->hasAlgorithm(__func__)) {
         s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(),
                                              {inA, inB, out}, spirv,
-                                             {unsigned(ne01),
+                                             {unsigned(ne01)/32,
                                               unsigned(ne11),
                                               unsigned(std::max(ne12, ne02))},
                                              {},
@@ -1619,6 +1668,16 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                                              ne11, ne12,
                                              nb11, nb12,
                                              nb1, nb2);
+                        break;
+                    case GGML_TYPE_Q6_K:
+                        ggml_vk_mul_mat_mat_q6_k(seq,
+                                             id_src0, id_src1, id_dst,
+                                             off_src0, off_src1, off_dst,
+                                             ne00, ne01, ne02,
+                                             nb01, nb02,
+                                             ne11, ne12,
+                                             nb11, nb12,
+                                             nb1, nb2);
                         break;
                     default: {
                         fprintf(stderr, "%s: %s: Unsupported quantization for M*M: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
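
Note (annotation, not part of the patch): the mat*mat shaders in this backend declare `layout(local_size_x = 32)`, so the dispatch recorded above is `{ne01/32, ne11, max(ne12, ne02)}` workgroups and each invocation computes exactly one output element. That is also why the `ggml_vk_mul_mat_mat_q4_x` hunk drops its x workgroup count from `ne01` to `ne01/32`. A minimal C++ sketch of the grid this implies, assuming `ne01` is a multiple of 32 (the helper name is ours, not from the tree):

```cpp
#include <algorithm>
#include <cstdint>

struct Grid { uint32_t x, y, z; };

// Workgroup grid matching the calls above: with local_size_x = 32 in the shader,
// global invocation (i, j, k) covers i in [0, ne01), j in [0, ne11),
// k in [0, max(ne12, ne02)), i.e. one invocation per dst element.
inline Grid mul_mat_mat_grid(int32_t ne01, int32_t ne11, int32_t ne02, int32_t ne12) {
    return {
        uint32_t(ne01) / 32u,              // 32 rows of src0 per workgroup, one per invocation
        uint32_t(ne11),                    // one workgroup per row of src1
        uint32_t(std::max(ne12, ne02))     // broadcast over the larger batch dimension
    };
}
```

If `ne01` were not a multiple of 32, the trailing rows would never be dispatched; after this change the q4/q8 mat*mat paths appear to rely on the same assumption.
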
diff --git a/kompute/op_mul_mat_mat_q6_k.comp b/kompute/op_mul_mat_mat_q6_k.comp
new file mode 100644
index 000000000..127f17df6
--- /dev/null
+++ b/kompute/op_mul_mat_mat_q6_k.comp
@@ -0,0 +1,88 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models
+ * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy
+ * of this license should accompany this software. Except as expressly granted
+ * in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#include "common.comp"
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+layout(local_size_x = 32) in;
+
+layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout(binding = 1) readonly buffer tensorInB { float inB[]; };
+layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout(push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    int ne11;
+    int ne12;
+    uint nb01;
+    uint nb02;
+    uint nb11;
+    uint nb12;
+    uint nb1;
+    uint nb2;
+}
+pcs;
+
+
+#define ELS_PER_BLOCK 256 //QK_K
+#define QH_OFFSET (ELS_PER_BLOCK / 2)
+#define QSCALES_OFFSET (QH_OFFSET + (ELS_PER_BLOCK / 4))
+#define SCALE_SCALE_OFFSET (QSCALES_OFFSET + (ELS_PER_BLOCK / 16))
+#define BLOCK_SIZE (SCALE_SCALE_OFFSET + 2)
+
+void main() {
+    uvec3 gid = gl_GlobalInvocationID;
+
+    uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
+    uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
+
+    const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA
+    const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+
+    float sum = 0.0f;
+    const uint n_blocks = pcs.ne00 / ELS_PER_BLOCK;
+    // this is pretty much all lifted right from dequantize_row_q6_K
+    uint outoff = 0;
+    for (uint i = 0; i < n_blocks; i++) {
+        const uint block_number = i;
+        const uint block_offset = block_number * BLOCK_SIZE;
+        const float scales_d = u8BufToFloat16(inA, x + block_offset + SCALE_SCALE_OFFSET);
+        uint qloff = block_offset;
+        uint qhoff = block_offset + QH_OFFSET;
+        uint scoff = block_offset + QSCALES_OFFSET;
+        for (int n = 0; n < 256; n += 128) {
+            for (int l = 0; l < 32; ++l) {
+                int is = l/16;
+                const int q1 = int((inA[x + qloff + l + 0] & 0xF) | (((inA[x + qhoff + l] >> 0) & 3) << 4)) - 32;
+                const int q2 = int((inA[x + qloff + l + 32] & 0xF) | (((inA[x + qhoff + l] >> 2) & 3) << 4)) - 32;
+                const int q3 = int((inA[x + qloff + l + 0] >> 4) | (((inA[x + qhoff + l] >> 4) & 3) << 4)) - 32;
+                const int q4 = int((inA[x + qloff + l + 32] >> 4) | (((inA[x + qhoff + l] >> 6) & 3) << 4)) - 32;
+                sum += inB[y + outoff + l + 0] * scales_d * int8_t(inA[x + scoff + is + 0]) * q1;
+                sum += inB[y + outoff + l + 32] * scales_d * int8_t(inA[x + scoff + is + 2]) * q2;
+                sum += inB[y + outoff + l + 64] * scales_d * int8_t(inA[x + scoff + is + 4]) * q3;
+                sum += inB[y + outoff + l + 96] * scales_d * int8_t(inA[x + scoff + is + 6]) * q4;
+            }
+            outoff += 128;
+            qloff += 64;
+            qhoff += 32;
+            scoff += 8;
+        }
+    }
+
+    out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
+}
\ No newline at end of file
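
For reference (annotation, not part of the patch): the hard-coded offsets `QH_OFFSET = 128`, `QSCALES_OFFSET = 192`, `SCALE_SCALE_OFFSET = 208`, and `BLOCK_SIZE = 210` mirror ggml's `block_q6_K` super-block with `QK_K == 256`. A C++ sketch of the layout the shader walks byte-by-byte:

```cpp
#include <cstdint>

typedef uint16_t ggml_fp16_t; // raw IEEE 754 half bits, read in the shader via u8BufToFloat16

// One Q6_K super-block: 256 six-bit weights packed into 210 bytes.
struct block_q6_K {
    uint8_t ql[256 / 2];      // low 4 bits of each quant    -> bytes [  0, 128), shader's qloff
    uint8_t qh[256 / 4];      // high 2 bits of each quant   -> bytes [128, 192), QH_OFFSET
    int8_t  scales[256 / 16]; // signed 16-weight sub-scales -> bytes [192, 208), QSCALES_OFFSET
    ggml_fp16_t d;            // super-block scale           -> bytes [208, 210), SCALE_SCALE_OFFSET
};

static_assert(sizeof(block_q6_K) == 210, "matches BLOCK_SIZE in op_mul_mat_mat_q6_k.comp");
```

Each pass of the `n` loop decodes 128 weights (four interleaved groups of 32, exactly as in `dequantize_row_q6_K`) and folds them straight into the running dot product against `inB` instead of storing dequantized values.
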