Consolidate code for mat x vec kernels and use subgroups more extensively.
This commit is contained in:
parent
77135a3bf5
commit
93306f16d0
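The gist of the change: the mat x vec kernels for F16, Q4_0 and Q4_1 now share one shader body (op_mul_mv_q_n.comp), the workgroup x size is set at pipeline creation from the device's subgroup size, and partial dot products are combined with subgroupAdd instead of a shared-memory tree reduction. Below is a rough host-side sketch (not code from this commit) of the resulting dispatch geometry for the quantized path; subgroup_size, ne01, ne11 and ne12 are placeholder values standing in for what the diff reads from VkPhysicalDeviceSubgroupProperties and the tensors, and it assumes the driver packs two subgroups per workgroup at local_x = 2 * subgroup_size.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t subgroup_size = 32;          // from VkPhysicalDeviceSubgroupProperties (hypothetical)
    const int32_t  ne01 = 4096;                 // rows of the quantized matrix (hypothetical)
    const int32_t  ne11 = 7, ne12 = 1;          // src1 columns / batch (hypothetical)

    // Mirrors the host changes below: local_size_x is a specialization constant
    // (local_size_x_id = 0), and the grid packs 8 output rows per workgroup
    // (2 subgroups per workgroup, N_ROWS = 4 rows per subgroup).
    const uint32_t local_x = subgroup_size * 2;
    const std::array<uint32_t, 3> workgroups = {
        uint32_t((ne01 + 7) / 8),
        uint32_t(ne11),
        uint32_t(ne12),
    };

    std::printf("local_size_x=%u dispatch=(%u, %u, %u)\n",
                local_x, workgroups[0], workgroups[1], workgroups[2]);
    return 0;
}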
@@ -165,11 +165,20 @@ std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
if (heapSize < memoryRequired)
continue;

vk::PhysicalDeviceSubgroupProperties subgroupProperties;
vk::PhysicalDeviceProperties2 deviceProperties2;
deviceProperties2.pNext = &subgroupProperties;
physicalDevices.at(i).getProperties2(&deviceProperties2);

if (subgroupProperties.subgroupSize < 32)
continue;

ggml_vk_device d;
d.index = i;
d.type = properties.deviceType;
d.heapSize = heapSize;
d.name = properties.deviceName;
d.subgroupSize = subgroupProperties.subgroupSize;
size_t n_idx = ++count_by_name[d.name];
if (n_idx > 1) {
d.name += " (" + std::to_string(n_idx) + ")";
@@ -242,7 +251,7 @@ bool ggml_vk_init_device(const ggml_vk_device &device) {
bool ggml_vk_init_device(int device) {
komputeManager()->initializeDevice(device, {},
{"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
"VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
"VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
return ggml_vk_has_device();
}
@@ -772,9 +781,10 @@ void ggml_vk_soft_max(kp::Sequence& seq,
};

std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(__func__))
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
else {
if (!komputeManager()->hasAlgorithm(__func__)) {
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
} else {
s_algo = komputeManager()->getAlgorithm(__func__);
s_algo->setTensors({in, out});
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
@@ -890,9 +900,10 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq,
};

std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(__func__))
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts});
else {
if (!komputeManager()->hasAlgorithm(__func__)) {
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
} else {
s_algo = komputeManager()->getAlgorithm(__func__);
s_algo->setTensors({inA, inB, out});
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
@@ -907,26 +918,28 @@ void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_siz
const std::shared_ptr<kp::Tensor>& inB,
const std::shared_ptr<kp::Tensor>& out,
uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
int32_t ne00, int32_t ne10, int32_t ne0,
int32_t ne01, int32_t ne11) {
int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) {
struct PushConstants {
uint32_t inAOff, inBOff, outOff;
int32_t ne00, ne10, ne0;
int32_t ne00, ne10, ne0, ne1, ne01, gqa;
} pushConsts {
safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
ne00, ne10, ne0,
ne00, ne10, ne0, ne1, ne01, ne12/ne02
};

std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(__func__))
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
else {
if (!komputeManager()->hasAlgorithm(__func__)) {
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
} else {
s_algo = komputeManager()->getAlgorithm(__func__);
s_algo->setTensors({inA, inB, out});
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)});
s_algo->setPushConstants<PushConstants>({pushConsts});
s_algo->updateDescriptors(s_kompute_context->pool.get());
}
seq.record<kp::OpTensorFill>({out});
seq.record<kp::OpAlgoDispatch>(s_algo);
}
@@ -1182,7 +1195,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
const uint32_t nb3 = dst ? dst->nb[3] : 0;

const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
// const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;

const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
@@ -1263,30 +1276,46 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
} break;
case GGML_OP_MUL_MAT:
{
if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32)
&& src1->type == GGML_TYPE_F32) {
ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
} else if (src0->type == GGML_TYPE_Q4_0
&& src1->type == GGML_TYPE_F32) {
ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
} else if (src0->type == GGML_TYPE_Q4_1
&& src1->type == GGML_TYPE_F32) {
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
} else {
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
if (src1t != GGML_TYPE_F32) {
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
goto not_implemented;
}

if (!ggml_is_transposed(src0)
&& !ggml_is_transposed(src1)
&& ne00%32 == 0
&& ne11 > 1) {
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
goto not_implemented;
} else {
switch (src0t) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
break;
case GGML_TYPE_Q4_0:
ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
break;
case GGML_TYPE_Q4_1:
ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02);
break;
default: {
fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
goto not_implemented;
}
}
}
} break;
case GGML_OP_GET_ROWS:
{
if (src0->type == GGML_TYPE_F16) {
if (src0t == GGML_TYPE_F16) {
ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else if (src0->type == GGML_TYPE_Q4_0) {
} else if (src0t == GGML_TYPE_Q4_0) {
ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else if (src0->type == GGML_TYPE_Q4_1) {
} else if (src0t == GGML_TYPE_Q4_1) {
ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
} else {
fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type);
fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
goto not_implemented;
}
} break;
@@ -34,6 +34,7 @@ struct ggml_vk_device {
size_t heapSize = 0;
std::string name;
std::string vendor;
int subgroupSize = 0;
};

std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
@@ -43,7 +43,7 @@ void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based fro
const uint nb = k / qk;

for (uint i = 0; i < nb; i++) {
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0);
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);

const float16_t d = block.d;
const float16_t m = block.m;
@@ -10,7 +10,9 @@

#include "common.comp"

layout(local_size_x = 64) in;
#extension GL_KHR_shader_subgroup_arithmetic : require

layout(local_size_x_id = 0) in;

layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -29,8 +31,6 @@ layout (push_constant) uniform parameter {
int ne1;
} pcs;

shared float sum[gl_WorkGroupSize.x];

void main() {
const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y;
@@ -39,24 +39,13 @@ void main() {
const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB

sum[gl_LocalInvocationID.x] = 0.0;

for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
float sumf = 0.0f;
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
sumf += float(inA[x+i]) * float(inB[y+i]);
}

// accumulate the sum from all threads in the threadgroup
barrier();
memoryBarrierShared();
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
if (gl_LocalInvocationID.x < i) {
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
}
barrier();
memoryBarrierShared();
}

if (gl_LocalInvocationID.x == 0) {
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
const float all_sum = subgroupAdd(sumf);
if (subgroupElect()) {
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
}
}
@@ -10,7 +10,13 @@

#include "common.comp"

layout(local_size_x = 8, local_size_y = 8) in;
#define BLOCKS_IN_QUANT QK4_0
#define SIZE_OF_BLOCK sizeof_block_q4_0
#define N_ROWS 4

layout(local_size_x_id = 0) in;
layout(local_size_y = 1) in;
layout(local_size_z = 1) in;

layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,58 +29,31 @@ layout (push_constant) uniform parameter {
int ne00;
int ne10;
int ne0;
int ne1;
int ne01;
int gqa;
} pcs;

shared float sum[64];

void main() {
const uint nb = uint(pcs.ne00/QK4_0);

const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y;

const uint x = r0*nb; // Based from inA without base offset
const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB

const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;

const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3

const uint first = 4 * iy;

float sumf = 0.0;

for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
const float d = float(u8BufToFloat16(inA, index));

const uint xl = first; // Based from bl->qs
const uint yl = y + i * QK4_0 + first; // Based from inB

// The q4_0 version of this function
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
vec2 acc = vec2(0.0, 0.0);
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
float d = float(u8BufToFloat16(inA, index));
float sumy = 0.0f;
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);

for (int j = 0; j < 4; ++j) {
const uint8_t b = inA[index+2+xl+j];
acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
acc.y += inB[yl+j] + inB[yl+j+16];
}
const float yl0 = inB[yb + i];
const float yl1 = inB[yb + i + 1];
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];

sumf += d * (acc.x - 8.*acc.y);
}
sumy += yl0 + yl1 + yl8 + yl9;

sum[ith] = sumf;

//
// Accumulate the sum from all threads in the threadgroup
//
barrier();
if (ith == 0) {
float sumTotal = 0.0;
for (uint i = 0; i < nth; ++i) {
sumTotal += sum[i];
}
out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
}
return d * (sumy * -8.f + acc[0] + acc[1]);
}

#include "op_mul_mv_q_n.comp"
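Not from the commit: a small CPU check of the mask-and-divide arithmetic used in block_q_n_dot_y above. Each 16-bit word b holds four packed 4-bit quants; masking and then dividing by 16, 256 or 4096 is the same as shifting the corresponding nibble down to the low bits, so the shader folds the extraction into the multiply instead of doing explicit shifts. The value of b below is hypothetical.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical packed word: nibbles (low to high) 0x1, 0x7, 0x3, 0xA
    const uint16_t b = 0xA371;

    // the shader's mask-and-divide form...
    const float q0 = float(b & 0x000F);            // lowest nibble
    const float q1 = float(b & 0x0F00) / 256.0f;   // third nibble
    const float q2 = float(b & 0x00F0) / 16.0f;    // second nibble
    const float q3 = float(b & 0xF000) / 4096.0f;  // highest nibble

    // ...matches plain shift-and-mask extraction
    assert(q0 == float((b >> 0)  & 0xF));
    assert(q1 == float((b >> 8)  & 0xF));
    assert(q2 == float((b >> 4)  & 0xF));
    assert(q3 == float((b >> 12) & 0xF));

    std::printf("%g %g %g %g\n", q0, q1, q2, q3);  // prints: 1 3 7 10
    return 0;
}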
@@ -10,7 +10,13 @@

#include "common.comp"

layout(local_size_x = 8, local_size_y = 8) in;
#define BLOCKS_IN_QUANT QK4_1
#define SIZE_OF_BLOCK sizeof_block_q4_1
#define N_ROWS 4

layout(local_size_x_id = 0) in;
layout(local_size_y = 1) in;
layout(local_size_z = 1) in;

layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -23,81 +29,33 @@ layout (push_constant) uniform parameter {
int ne00;
int ne10;
int ne0;
int ne1;
int ne01;
int gqa;
} pcs;

shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];

#define UNALIGNED_INPUT inA

block_q4_1 get_unaligned_block_q4_1(uint index) {
block_q4_1 fres;
fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
fres.qs[it] = UNALIGNED_INPUT[index+4+it];
}
return fres;
}

void main() {
const uint nb = uint(pcs.ne00/QK4_1);

const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y;

const uint x = r0*nb; // Based from inA without base offset
const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB

const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;

const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3

const uint first = 4 * iy;

float sumf = 0.0;

for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
//TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:

const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);

const float d = float(block.d);
const float m = float(block.m);

const uint xl = first; // Based from bl->qs
const uint yl = y + i * QK4_1 + first; // Based from inB

// The q4_1 version of this function
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
vec2 acc = vec2(0.0, 0.0);
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
float d = float(u8BufToFloat16(inA, index));
float m = float(u8BufToFloat16(inA, index+2));

for (int j = 0; j < 4; ++j) {
acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
}
float sumy = 0.0f;
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);

sumf += d * (acc.x - acc.y);
}
const float yl0 = inB[yb + i];
const float yl1 = inB[yb + i + 1];
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];

sum[ith] = sumf;
sumy += yl0 + yl1 + yl8 + yl9;

//
// Accumulate the sum from all threads in the threadgroup
//
barrier();
memoryBarrierShared();
if (ith%4 == 0) {
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
}
barrier();
memoryBarrierShared();
if (ith%16 == 0) {
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
}
barrier();
memoryBarrierShared();
if (ith == 0) {
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
}
return d * (acc[0] + acc[1]) + sumy * m;
}

#include "op_mul_mv_q_n.comp"
kompute/op_mul_mv_q_n.comp (new file, 49 lines)
@@ -0,0 +1,49 @@
/**
 * Copyright (c) 2023 Nomic, Inc. All rights reserved.
 *
 * This software is licensed under the terms of the Software for Open Models License (SOM),
 * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
 * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
 */

#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_EXT_debug_printf : enable

void main() {
const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y;
const uint im = gl_WorkGroupID.z;
const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0);

const uint x = offset0; // Based from inA without base offset
const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB

float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};

const uint ix = gl_SubgroupInvocationID/2;
const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);

uint yb = y + ix * BLOCKS_IN_QUANT + il;

debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);

for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) {
for (int row = 0; row < N_ROWS; row++) {
const uint block_index = x + ib + row * nb;
sumf[row] += block_q_n_dot_y(block_index, yb, il);
}

yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2;
}

for (int row = 0; row < N_ROWS; ++row) {
const float tot = subgroupAdd(sumf[row]);
if (first_row + row < pcs.ne01 && subgroupElect()) {
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
}
}
}
@@ -10,9 +10,9 @@

#include "common.comp"

#define nth 32
#extension GL_KHR_shader_subgroup_arithmetic : require

layout(local_size_x = nth) in;
layout(local_size_x_id = 0) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
@@ -25,8 +25,6 @@ layout(push_constant) uniform PushConstants {
int ne02;
} pcs;

shared float buf[nth];

void main() {
const uint i03 = gl_WorkGroupID.z;
const uint i02 = gl_WorkGroupID.y;
@@ -37,46 +35,22 @@ void main() {
const uint pdst = extra_off + pcs.outOff; // Based from out_

// parallel max
buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
float localMax = uintBitsToFloat(0xFF800000);
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
localMax = max(localMax, in_[psrc0 + i00]);
}

// reduce
barrier();
memoryBarrierShared();
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
if (gl_LocalInvocationID.x < i) {
buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
}
barrier();
memoryBarrierShared();
}

// broadcast
const float max_ = buf[0];
float max_ = subgroupMax(localMax);

// parallel sum
buf[gl_LocalInvocationID.x] = 0.0;
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
float localSum = 0.0f;
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
const float exp_psrc0 = exp(in_[psrc0 + i00] - max_);
localSum += exp_psrc0;
out_[pdst + i00] = exp_psrc0;
}

// reduce
barrier();
memoryBarrierShared();
[[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
if (gl_LocalInvocationID.x < i) {
buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
}
barrier();
memoryBarrierShared();
}

// broadcast
const float sum = buf[0];

for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
const float sum = subgroupAdd(localSum);
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) {
out_[pdst + i00] /= sum;
}
}
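For reference (not part of the diff): the reworked soft_max shader now stores exp(x - max) into out_ while accumulating the sum and then divides in place, instead of recomputing the exponentials in a final pass. A scalar C++ sketch of that structure over a single row, with src and dst as hypothetical buffers:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar model of the subgroup softmax: pass 1 finds the row max, pass 2 writes
// exp(x - max) once while summing it, then the stored values are normalized.
void soft_max_row(const std::vector<float>& src, std::vector<float>& dst) {
    const float max_ = *std::max_element(src.begin(), src.end());

    float sum = 0.0f;
    for (std::size_t i = 0; i < src.size(); ++i) {
        const float e = std::exp(src[i] - max_);
        dst[i] = e;       // like: out_[pdst + i00] = exp_psrc0
        sum += e;
    }
    for (std::size_t i = 0; i < dst.size(); ++i) {
        dst[i] /= sum;    // like: out_[pdst + i00] /= sum
    }
}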
@@ -13,6 +13,7 @@ add_library(kompute STATIC Algorithm.cpp
OpAlgoDispatch.cpp
OpMemoryBarrier.cpp
OpTensorCopy.cpp
OpTensorFill.cpp
OpTensorSyncDevice.cpp
OpTensorSyncLocal.cpp
OpBufferSyncDevice.cpp
kompute/src/OpTensorFill.cpp (new file, 55 lines)
@@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0

/**
 * Copyright (c) 2023 Nomic, Inc. All rights reserved.
 *
 * This software is licensed under the terms of the Software for Open Models License (SOM),
 * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
 * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
 */

#include "kompute/operations/OpTensorFill.hpp"
#include "kompute/Tensor.hpp"

namespace kp {

OpTensorFill::OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors)
{
KP_LOG_DEBUG("Kompute OpTensorFill constructor with params");

if (tensors.size() < 1) {
throw std::runtime_error(
"Kompute OpTensorFill called with less than 1 tensor");
}

this->mTensors = tensors;
}

OpTensorFill::~OpTensorFill()
{
KP_LOG_DEBUG("Kompute OpTensorFill destructor started");
}

void
OpTensorFill::record(const vk::CommandBuffer& commandBuffer)
{
KP_LOG_DEBUG("Kompute OpTensorFill record called");

for (size_t i = 0; i < this->mTensors.size(); i++) {
this->mTensors[i]->recordFill(commandBuffer, 0);
}
}

void
OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
KP_LOG_DEBUG("Kompute OpTensorFill preEval called");
}

void
OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
KP_LOG_DEBUG("Kompute OpTensorFill postEval called");
}

}
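A usage sketch for the new operation (not from the diff, and assuming the stock Kompute Manager/Sequence API and umbrella header that this vendored copy follows): recording kp::OpTensorFill zero-fills the tensor's device buffer, which the host code above records before the mat x vec dispatch.

#include <memory>
#include <vector>
#include <kompute/Kompute.hpp>

int main() {
    kp::Manager mgr;

    // hypothetical tensor whose device buffer we want cleared before a dispatch
    auto out = mgr.tensor(std::vector<float>(1024, -1.0f));

    auto seq = mgr.sequence();
    seq->record<kp::OpTensorSyncDevice>({out});
    seq->record<kp::OpTensorFill>({out});      // vkCmdFillBuffer(..., 0) on the primary buffer
    // seq->record<kp::OpAlgoDispatch>(algo);  // the actual kernel would follow here
    seq->record<kp::OpTensorSyncLocal>({out});
    seq->eval();                               // out->vector() now holds all zeros
    return 0;
}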
@@ -215,6 +215,13 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
}

void
Tensor::recordFill(const vk::CommandBuffer &commandBuffer,
uint32_t fill)
{
commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill);
}

void
Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
vk::AccessFlagBits srcAccessMask,
@@ -21,6 +21,7 @@ target_sources(kompute PRIVATE
kompute/operations/OpMemoryBarrier.hpp
kompute/operations/OpMult.hpp
kompute/operations/OpTensorCopy.hpp
kompute/operations/OpTensorFill.hpp
kompute/operations/OpTensorSyncDevice.hpp
kompute/operations/OpTensorSyncLocal.hpp
kompute/operations/OpBufferSyncDevice.hpp
@@ -15,6 +15,7 @@
#include "operations/OpTensorSyncLocal.hpp"
#include "operations/OpBufferSyncDevice.hpp"
#include "operations/OpBufferSyncLocal.hpp"
#include "operations/OpTensorFill.hpp"

// Will be build by CMake and placed inside the build directory
#include "ShaderLogisticRegression.hpp"
@@ -126,6 +126,9 @@ class Tensor
void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
std::shared_ptr<Tensor> copyFromTensor);

void recordFill(const vk::CommandBuffer &commandBuffer,
uint32_t fill);

/**
 * Records a copy from the internal staging memory to the device memory
 * using an optional barrier to wait for the operation. This function would
@@ -279,6 +282,7 @@ class Tensor
vk::Buffer *bufferTo,
vk::DeviceSize bufferSize,
vk::BufferCopy copyRegion);

void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
const vk::Buffer& buffer,
vk::AccessFlagBits srcAccessMask,
kompute/src/include/kompute/operations/OpTensorFill.hpp (new file, 58 lines)
@@ -0,0 +1,58 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once

#include "kompute/Core.hpp"

#include "kompute/Tensor.hpp"

#include "kompute/operations/OpBase.hpp"

namespace kp {

/**
 * Operation that fills the tensor
 */
class OpTensorFill : public OpBase
{
public:
/**
 * Default constructor with parameters that provides the core vulkan
 * resources and the tensors that will be used in the operation.
 *
 * @param tensors Tensors that will be used to create in operation.
 */
OpTensorFill(const std::vector<std::shared_ptr<Tensor>>& tensors);

/**
 * Default destructor. This class does not manage memory so it won't be
 * expecting the parent to perform a release.
 */
~OpTensorFill() override;

/**
 * Records the fill command for tensor.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
void record(const vk::CommandBuffer& commandBuffer) override;

/**
 * Does not perform any preEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

/**
 * Does not perform any postEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

private:
// -------------- ALWAYS OWNED RESOURCES
std::vector<std::shared_ptr<Tensor>> mTensors;
};

} // End namespace kp
@@ -6495,7 +6495,8 @@ struct llama_context * llama_new_context_with_model(
if (ggml_vk_has_device() && params.n_gpu_layers > 0
&& (model->ftype == LLAMA_FTYPE_ALL_F32
|| model->ftype == LLAMA_FTYPE_MOSTLY_F16
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) {
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
// this allocates all Vulkan resources and memory buffers
ctx->ctx_kompute = ggml_vk_init();