vulkan : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

This commit is contained in:
Jared Van Bortel 2023-11-23 17:18:27 -05:00
parent af00cca08e
commit 71565eb0c3
2 changed files with 82 additions and 21 deletions

View File

@@ -579,29 +579,48 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
return a / b; return a / b;
} }
void ggml_vk_add(kp::Sequence& seq, void ggml_vk_add(
const std::shared_ptr<kp::Tensor>& inA, kp::Sequence& seq,
const std::shared_ptr<kp::Tensor>& inB, const std::shared_ptr<kp::Tensor>& inA,
const std::shared_ptr<kp::Tensor>& out, const std::shared_ptr<kp::Tensor>& inB,
uint32_t inAOff, uint32_t inBOff, uint32_t outOff, const std::shared_ptr<kp::Tensor>& out,
uint32_t size) { uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
int32_t ne0,
int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
) {
const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv, const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
kp::shader_data::op_add_comp_spv_len); kp::shader_data::op_add_comp_spv_len);
struct PushConstants { struct PushConstants {
uint32_t inAOff, inBOff, outOff; uint32_t inAOff, inBOff, outOff;
int32_t ne00;
int32_t nb00, nb01, nb02, nb03;
int32_t ne10, ne11, ne12, ne13;
int32_t nb10, nb11, nb12, nb13;
int32_t ne0;
int32_t nb0, nb1, nb2, nb3;
} const pushConsts { } const pushConsts {
safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4) safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
ne00,
nb00, nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb10, nb11, nb12, nb13,
ne0,
nb0, nb1, nb2, nb3
}; };
std::shared_ptr<kp::Algorithm> s_algo = nullptr; std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(__func__)) if (!komputeManager()->hasAlgorithm(__func__)) {
s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
else { } else {
s_algo = komputeManager()->getAlgorithm(__func__); s_algo = komputeManager()->getAlgorithm(__func__);
s_algo->setTensors({inA, inB, out}); s_algo->setTensors({inA, inB, out});
s_algo->setWorkgroup({size}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
s_algo->setPushConstants<PushConstants>({pushConsts}); s_algo->setPushConstants<PushConstants>({pushConsts});
s_algo->updateDescriptors(s_kompute_context->pool.get()); s_algo->updateDescriptors(s_kompute_context->pool.get());
} }
@@ -1315,12 +1334,12 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
const int32_t ne10 = src1 ? src1->ne[0] : 0; const int32_t ne10 = src1 ? src1->ne[0] : 0;
const int32_t ne11 = src1 ? src1->ne[1] : 0; const int32_t ne11 = src1 ? src1->ne[1] : 0;
const int32_t ne12 = src1 ? src1->ne[2] : 0; const int32_t ne12 = src1 ? src1->ne[2] : 0;
// const int32_t ne13 = src1 ? src1->ne[3] : 0; const int32_t ne13 = src1 ? src1->ne[3] : 0;
// const uint32_t nb10 = src1 ? src1->nb[0] : 0; const uint32_t nb10 = src1 ? src1->nb[0] : 0;
const uint32_t nb11 = src1 ? src1->nb[1] : 0; const uint32_t nb11 = src1 ? src1->nb[1] : 0;
const uint32_t nb12 = src1 ? src1->nb[2] : 0; const uint32_t nb12 = src1 ? src1->nb[2] : 0;
// const uint32_t nb13 = src1 ? src1->nb[3] : 0; const uint32_t nb13 = src1 ? src1->nb[3] : 0;
const int32_t ne0 = dst ? dst->ne[0] : 0; const int32_t ne0 = dst ? dst->ne[0] : 0;
const int32_t ne1 = dst ? dst->ne[1] : 0; const int32_t ne1 = dst ? dst->ne[1] : 0;
@@ -1354,11 +1373,19 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph
} break; } break;
case GGML_OP_ADD: case GGML_OP_ADD:
{ {
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
// src1 is a row // src1 is a row
ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
} else { } else {
ggml_vk_add(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4); ggml_vk_add(
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
ne00, ne01, ne02, ne03,
nb00, nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb10, nb11, nb12, nb13,
ne0,
nb0, nb1, nb2, nb3
);
} }
} break; } break;
case GGML_OP_MUL: case GGML_OP_MUL:

View File

@@ -10,7 +10,7 @@
#include "common.comp" #include "common.comp"
layout(local_size_x = 1) in; layout(local_size_x = 1024) in;
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
@@ -20,13 +20,47 @@ layout(push_constant) uniform PushConstants {
uint inAOff; uint inAOff;
uint inBOff; uint inBOff;
uint outOff; uint outOff;
int ne00;
int nb00;
int nb01;
int nb02;
int nb03;
int ne10;
int ne11;
int ne12;
int ne13;
int nb10;
int nb11;
int nb12;
int nb13;
int ne0;
int nb0;
int nb1;
int nb2;
int nb3;
} pcs; } pcs;
// general-purpose kernel for addition of two tensors
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
// cons: not very efficient
void main() { void main() {
const uint baseIndex = gl_WorkGroupID.x * 4; const uint i03 = gl_WorkGroupID.z;
const uint i02 = gl_WorkGroupID.y;
const uint i01 = gl_WorkGroupID.x;
for (uint x = 0; x < 4; x++) { const uint i13 = i03 % pcs.ne13;
const uint i = baseIndex + x; const uint i12 = i02 % pcs.ne12;
out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[i + pcs.inBOff]; const uint i11 = i01 % pcs.ne11;
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + gl_SubgroupInvocationID.x*pcs.nb00) / 4);
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 + gl_SubgroupInvocationID.x*pcs.nb10) / 4);
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + gl_SubgroupInvocationID.x*pcs.nb0 ) / 4);
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
out_[pcs.outOff + dst_off] = inA[pcs.inAOff + src0_off] + inB[pcs.inBOff + src1_off];
src0_off += gl_WorkGroupSize.x*pcs.ne00;
src1_off += gl_WorkGroupSize.x*pcs.ne10;
dst_off += gl_WorkGroupSize.x*pcs.ne0;
} }
} }