mirror of
https://github.com/ggerganov/llama.cpp.git
Consolidate code for mat x vec kernels and use subgroups more extensively.
parent 77135a3bf5
commit 93306f16d0
@@ -165,11 +165,20 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
         if (heapSize < memoryRequired)
             continue;
 
+        vk::PhysicalDeviceSubgroupProperties subgroupProperties;
+        vk::PhysicalDeviceProperties2 deviceProperties2;
+        deviceProperties2.pNext = &subgroupProperties;
+        physicalDevices.at(i).getProperties2(&deviceProperties2);
+
+        if (subgroupProperties.subgroupSize < 32)
+            continue;
+
         ggml_vk_device d;
         d.index = i;
         d.type = properties.deviceType;
         d.heapSize = heapSize;
         d.name = properties.deviceName;
+        d.subgroupSize = subgroupProperties.subgroupSize;
         size_t n_idx = ++count_by_name[d.name];
         if (n_idx > 1) {
             d.name += " (" + std::to_string(n_idx) + ")";
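The hunk above chains vk::PhysicalDeviceSubgroupProperties behind vk::PhysicalDeviceProperties2 to read each device's subgroup width and skips devices narrower than 32 lanes. As an illustration only (a sketch, not code from this commit), the same query can be isolated into a small vulkan-hpp helper; it assumes Vulkan 1.1+ and an already enumerated vk::PhysicalDevice:

    // Hedged sketch: query the subgroup width of a physical device (Vulkan 1.1+).
    #include <vulkan/vulkan.hpp>
    #include <cstdint>

    uint32_t get_subgroup_size(const vk::PhysicalDevice & dev) {
        vk::PhysicalDeviceSubgroupProperties subgroupProperties;
        vk::PhysicalDeviceProperties2 deviceProperties2;
        deviceProperties2.pNext = &subgroupProperties; // chain the subgroup query
        dev.getProperties2(&deviceProperties2);
        return subgroupProperties.subgroupSize;        // e.g. 32 (NVIDIA) or 64 (AMD wave64)
    }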
@@ -242,7 +251,7 @@ bool ggml_vk_init_device(const ggml_vk_device &device) {
 bool ggml_vk_init_device(int device) {
     komputeManager()->initializeDevice(device, {},
                                        {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
-                                        "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
+                                        "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
     return ggml_vk_has_device();
 }
 
@@ -772,9 +781,10 @@ void ggml_vk_soft_max(kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
@@ -890,9 +900,10 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
@@ -907,26 +918,28 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size,
                           const std::shared_ptr<kp::Tensor>& inB,
                           const std::shared_ptr<kp::Tensor>& out,
                           uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-                          int32_t ne00, int32_t ne10, int32_t ne0,
-                          int32_t ne01, int32_t ne11) {
+                          int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
+                          int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0;
+        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0,
+        ne00, ne10, ne0, ne1, ne01, ne12/ne02
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
-    else {
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
+        s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
+    seq.record<kp::OpTensorFill>({out});
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
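Dispatch arithmetic worth noting in the hunk above: local_x is set to twice the device subgroup size, so each workgroup holds two subgroups, and the new op_mul_mv_q_n.comp kernel (added below) makes every subgroup accumulate N_ROWS = 4 output rows, which is why (ne01 + 7)/8 workgroups along x cover all rows. A self-contained sketch of that arithmetic, with made-up example dimensions (illustrative only, not code from this commit):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t subgroup_size = 32;                                  // assumed device value
        const uint32_t local_x       = subgroup_size * 2;                   // specialization constant 0
        const uint32_t n_rows        = 4;                                   // N_ROWS in the shader
        const uint32_t rows_per_wg   = (local_x / subgroup_size) * n_rows;  // 2 subgroups * 4 rows = 8
        const uint32_t ne01 = 4096, ne11 = 7, ne12 = 1;                     // hypothetical tensor dims
        const uint32_t wg_x = (ne01 + rows_per_wg - 1) / rows_per_wg;       // same as (ne01 + 7)/8
        std::printf("workgroup size %u, dispatch %u x %u x %u\n", local_x, wg_x, ne11, ne12);
        return 0;
    }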
@@ -1182,7 +1195,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
             const uint32_t nb3 = dst ? dst->nb[3] : 0;
 
             const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-            // const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
             const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
 
             const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
@@ -1263,30 +1276,46 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
                 } break;
             case GGML_OP_MUL_MAT:
                 {
-                    if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32)
-                        && src1->type == GGML_TYPE_F32) {
-                        ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
-                    } else if (src0->type == GGML_TYPE_Q4_0
-                        && src1->type == GGML_TYPE_F32) {
-                        ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
-                    } else if (src0->type == GGML_TYPE_Q4_1
-                        && src1->type == GGML_TYPE_F32) {
-                        ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
-                    } else {
-                        fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
+                    if (src1t != GGML_TYPE_F32) {
+                        fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
                         goto not_implemented;
                     }
+
+                    if (!ggml_is_transposed(src0)
+                        && !ggml_is_transposed(src1)
+                        && ne00%32 == 0
+                        && ne11 > 1) {
+                        fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                        goto not_implemented;
+                    } else {
+                        switch (src0t) {
+                            case GGML_TYPE_F16:
+                            case GGML_TYPE_F32:
+                                ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                                break;
+                            case GGML_TYPE_Q4_0:
+                                ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
+                            case GGML_TYPE_Q4_1:
+                                ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
+                                break;
+                            default: {
+                                fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
+                                goto not_implemented;
+                            }
+                        }
+                    }
                 } break;
             case GGML_OP_GET_ROWS:
                 {
-                    if (src0->type == GGML_TYPE_F16) {
+                    if (src0t == GGML_TYPE_F16) {
                         ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
-                    } else if (src0->type == GGML_TYPE_Q4_0) {
+                    } else if (src0t == GGML_TYPE_Q4_0) {
                         ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
-                    } else if (src0->type == GGML_TYPE_Q4_1) {
+                    } else if (src0t == GGML_TYPE_Q4_1) {
                         ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                     } else {
-                        fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type);
+                        fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
                         goto not_implemented;
                     }
                 } break;
@@ -34,6 +34,7 @@ struct ggml_vk_device {
     size_t heapSize = 0;
     std::string name;
     std::string vendor;
+    int subgroupSize = 0;
 };
 
 std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
@@ -43,7 +43,7 @@ void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based fro
     const uint nb = k / qk;
 
     for (uint i = 0; i < nb; i++) {
-        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0);
+        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
 
         const float16_t d = block.d;
         const float16_t m = block.m;
@@ -10,7 +10,9 @@
 
 #include "common.comp"
 
-layout(local_size_x = 64) in;
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(local_size_x_id = 0) in;
 
 layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -29,8 +31,6 @@ layout (push_constant) uniform parameter {
     int ne1;
 } pcs;
 
-shared float sum[gl_WorkGroupSize.x];
-
 void main() {
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
@@ -39,24 +39,13 @@ void main() {
     const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
     const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
 
-    sum[gl_LocalInvocationID.x] = 0.0;
-
-    for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
-        sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
+    float sumf = 0.0f;
+    for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
+        sumf += float(inA[x+i]) * float(inB[y+i]);
     }
 
-    // accumulate the sum from all threads in the threadgroup
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
-        }
-        barrier();
-        memoryBarrierShared();
-    }
-
-    if (gl_LocalInvocationID.x == 0) {
-        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
+    const float all_sum = subgroupAdd(sumf);
+    if (subgroupElect()) {
+        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
     }
 }
@@ -10,7 +10,13 @@
 
 #include "common.comp"
 
-layout(local_size_x = 8, local_size_y = 8) in;
+#define BLOCKS_IN_QUANT QK4_0
+#define SIZE_OF_BLOCK sizeof_block_q4_0
+#define N_ROWS 4
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y = 1) in;
+layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,58 +29,31 @@ layout (push_constant) uniform parameter {
     int ne00;
     int ne10;
     int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
 } pcs;
 
-shared float sum[64];
-
-void main() {
-    const uint nb = uint(pcs.ne00/QK4_0);
-
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-
-    const uint x = r0*nb; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
-
-    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
-    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
-
-    const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
-    const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
-
-    const uint first = 4 * iy;
-
-    float sumf = 0.0;
-
-    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
-        const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
-        const float d = float(u8BufToFloat16(inA, index));
-
-        const uint xl = first; // Based from bl->qs
-        const uint yl = y + i * QK4_0 + first; // Based from inB
-
-        vec2 acc = vec2(0.0, 0.0);
-
-        for (int j = 0; j < 4; ++j) {
-            const uint8_t b = inA[index+2+xl+j];
-            acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
-            acc.y += inB[yl+j] + inB[yl+j+16];
-        }
-
-        sumf += d * (acc.x - 8.*acc.y);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    barrier();
-    if (ith == 0) {
-        float sumTotal = 0.0;
-        for (uint i = 0; i < nth; ++i) {
-            sumTotal += sum[i];
-        }
-        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
+// The q4_0 version of this function
+float block_q_n_dot_y(uint block_index, uint yb, uint il) {
+    vec2 acc = vec2(0.0, 0.0);
+    const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
+    float d = float(u8BufToFloat16(inA, index));
+    float sumy = 0.0f;
+    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
+        const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
+
+        const float yl0 = inB[yb + i];
+        const float yl1 = inB[yb + i + 1];
+        const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
+        const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
+
+        sumy += yl0 + yl1 + yl8 + yl9;
+
+        acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
+        acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
     }
+    return d * (sumy * -8.f + acc[0] + acc[1]);
 }
+
+#include "op_mul_mv_q_n.comp"
@@ -10,7 +10,13 @@
 
 #include "common.comp"
 
-layout(local_size_x = 8, local_size_y = 8) in;
+#define BLOCKS_IN_QUANT QK4_1
+#define SIZE_OF_BLOCK sizeof_block_q4_1
+#define N_ROWS 4
+
+layout(local_size_x_id = 0) in;
+layout(local_size_y = 1) in;
+layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,81 +29,33 @@ layout (push_constant) uniform parameter {
     int ne00;
     int ne10;
     int ne0;
+    int ne1;
+    int ne01;
+    int gqa;
 } pcs;
 
-shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];
-
-#define UNALIGNED_INPUT inA
-
-block_q4_1 get_unaligned_block_q4_1(uint index) {
-    block_q4_1 fres;
-    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
-    fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
-    [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
-        fres.qs[it] = UNALIGNED_INPUT[index+4+it];
+// The q4_1 version of this function
+float block_q_n_dot_y(uint block_index, uint yb, uint il) {
+    vec2 acc = vec2(0.0, 0.0);
+    const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
+    float d = float(u8BufToFloat16(inA, index));
+    float m = float(u8BufToFloat16(inA, index+2));
+
+    float sumy = 0.0f;
+    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
+        const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
+
+        const float yl0 = inB[yb + i];
+        const float yl1 = inB[yb + i + 1];
+        const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
+        const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
+
+        sumy += yl0 + yl1 + yl8 + yl9;
+
+        acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
+        acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
     }
-    return fres;
+    return d * (acc[0] + acc[1]) + sumy * m;
 }
 
-void main() {
-    const uint nb = uint(pcs.ne00/QK4_1);
-
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-
-    const uint x = r0*nb; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
-
-    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
-    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
-
-    const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
-    const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
-
-    const uint first = 4 * iy;
-
-    float sumf = 0.0;
-
-    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
-        //TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
-
-        const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
-
-        const float d = float(block.d);
-        const float m = float(block.m);
-
-        const uint xl = first; // Based from bl->qs
-        const uint yl = y + i * QK4_1 + first; // Based from inB
-
-        vec2 acc = vec2(0.0, 0.0);
-
-        for (int j = 0; j < 4; ++j) {
-            acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
-            acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
-        }
-
-        sumf += d * (acc.x - acc.y);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    barrier();
-    memoryBarrierShared();
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    barrier();
-    memoryBarrierShared();
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    barrier();
-    memoryBarrierShared();
-    if (ith == 0) {
-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
-        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
-    }
-}
+#include "op_mul_mv_q_n.comp"
kompute/op_mul_mv_q_n.comp (new file)
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+void main() {
+    const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint im = gl_WorkGroupID.z;
+    const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
+    const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0);
+
+    const uint x = offset0; // Based from inA without base offset
+    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+    const uint ix = gl_SubgroupInvocationID/2;
+    const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
+
+    uint yb = y + ix * BLOCKS_IN_QUANT + il;
+
+    debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
+                   gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
+                   gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
+
+    for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
+        for (int row = 0; row < N_ROWS; row++) {
+            const uint block_index = x + ib + row * nb;
+            sumf[row] += block_q_n_dot_y(block_index, yb, il);
+        }
+
+        yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2;
+    }
+
+    for (int row = 0; row < N_ROWS; ++row) {
+        const float tot = subgroupAdd(sumf[row]);
+        if (first_row + row < pcs.ne01 && subgroupElect()) {
+            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
+        }
+    }
+}
@@ -10,9 +10,9 @@
 
 #include "common.comp"
 
-#define nth 32
+#extension GL_KHR_shader_subgroup_arithmetic : require
 
-layout(local_size_x = nth) in;
+layout(local_size_x_id = 0) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
@@ -25,8 +25,6 @@ layout(push_constant) uniform PushConstants {
     int ne02;
 } pcs;
 
-shared float buf[nth];
-
 void main() {
     const uint i03 = gl_WorkGroupID.z;
     const uint i02 = gl_WorkGroupID.y;
@@ -37,46 +35,22 @@ void main() {
     const uint pdst = extra_off + pcs.outOff; // Based from out_
 
     // parallel max
-    buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
+    float localMax = uintBitsToFloat(0xFF800000);
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        localMax = max(localMax, in_[psrc0 + i00]);
     }
-
-    // reduce
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
-        }
-        barrier();
-        memoryBarrierShared();
-    }
-
-    // broadcast
-    const float max_ = buf[0];
+    float max_ = subgroupMax(localMax);
 
     // parallel sum
-    buf[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
+    float localSum = 0.0f;
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
+        localSum += exp_psrc0;
+        out_[pdst + i00] = exp_psrc0;
     }
 
-    // reduce
-    barrier();
-    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
-        if (gl_LocalInvocationID.x < i) {
-            buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
-        }
-        barrier();
-        memoryBarrierShared();
-    }
-
-    // broadcast
-    const float sum = buf[0];
-
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
+    const float sum = subgroupAdd(localSum);
+    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
+        out_[pdst + i00] /= sum;
     }
 }
@@ -13,6 +13,7 @@ add_library(kompute STATIC Algorithm.cpp
     OpAlgoDispatch.cpp
     OpMemoryBarrier.cpp
    OpTensorCopy.cpp
+    OpTensorFill.cpp
    OpTensorSyncDevice.cpp
    OpTensorSyncLocal.cpp
    OpBufferSyncDevice.cpp
kompute/src/OpTensorFill.cpp (new file)
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpTensorFill.hpp"
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+OpTensorFill::OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill constructor with params");
+
+    if (tensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpTensorFill called with less than 1 tensor");
+    }
+
+    this->mTensors = tensors;
+}
+
+OpTensorFill::~OpTensorFill()
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill destructor started");
+}
+
+void
+OpTensorFill::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill record called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        this->mTensors[i]->recordFill(commandBuffer, 0);
+    }
+}
+
+void
+OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill preEval called");
+}
+
+void
+OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorFill postEval called");
+}
+
+}
@@ -215,6 +215,13 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
     commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
 }
 
+void
+Tensor::recordFill(const vk::CommandBuffer &commandBuffer,
+                   uint32_t fill)
+{
+    commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill);
+}
+
 void
 Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                          vk::AccessFlagBits srcAccessMask,
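OpTensorFill (added above) records a fill over each tensor's primary device buffer via the new Tensor::recordFill, and ggml_vk_mul_mat_q4_x uses it to zero the output tensor before the dispatch. A minimal usage sketch, assuming the vendored Kompute Manager/Sequence API and hypothetical tensor contents (illustrative only, not code from this commit):

    #include <kompute/Kompute.hpp>
    #include <vector>

    int main() {
        kp::Manager mgr;                                     // default device 0
        auto out = mgr.tensor(std::vector<float>(16, 1.0f)); // hypothetical data
        mgr.sequence()
           ->record<kp::OpTensorSyncDevice>({out})           // upload to device
           ->record<kp::OpTensorFill>({out})                 // fill device buffer with 0
           ->record<kp::OpTensorSyncLocal>({out})            // read back
           ->eval();
        // out->vector() now holds zeros
        return 0;
    }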
@@ -21,6 +21,7 @@ target_sources(kompute PRIVATE
     kompute/operations/OpMemoryBarrier.hpp
     kompute/operations/OpMult.hpp
     kompute/operations/OpTensorCopy.hpp
+    kompute/operations/OpTensorFill.hpp
     kompute/operations/OpTensorSyncDevice.hpp
     kompute/operations/OpTensorSyncLocal.hpp
     kompute/operations/OpBufferSyncDevice.hpp
@@ -15,6 +15,7 @@
 #include "operations/OpTensorSyncLocal.hpp"
 #include "operations/OpBufferSyncDevice.hpp"
 #include "operations/OpBufferSyncLocal.hpp"
+#include "operations/OpTensorFill.hpp"
 
 // Will be build by CMake and placed inside the build directory
 #include "ShaderLogisticRegression.hpp"
@@ -126,6 +126,9 @@ class Tensor
     void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
                         std::shared_ptr<Tensor> copyFromTensor);
 
+    void recordFill(const vk::CommandBuffer &commandBuffer,
+                    uint32_t fill);
+
     /**
      * Records a copy from the internal staging memory to the device memory
      * using an optional barrier to wait for the operation. This function would
@@ -279,6 +282,7 @@ class Tensor
                           vk::Buffer *bufferTo,
                           vk::DeviceSize bufferSize,
                           vk::BufferCopy copyRegion);
 
     void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                    const vk::Buffer& buffer,
                                    vk::AccessFlagBits srcAccessMask,
kompute/src/include/kompute/operations/OpTensorFill.hpp (new file)
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that fills the tensor
+ */
+class OpTensorFill : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation.
+     *
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorFill() override;
+
+    /**
+     * Records the fill command for tensor.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
@@ -6495,7 +6495,8 @@ struct llama_context * llama_new_context_with_model(
         if (ggml_vk_has_device() && params.n_gpu_layers > 0
             && (model->ftype == LLAMA_FTYPE_ALL_F32
                 || model->ftype == LLAMA_FTYPE_MOSTLY_F16
-                || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) {
+                || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
+                || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
             // this allocates all Vulkan resources and memory buffers
             ctx->ctx_kompute = ggml_vk_init();
 