vulkan : optimize workgroup sizes

2024-12-29 04:44:34 +00:00 · 2023-11-23 17:18:48 -05:00 · 2023-11-23 17:18:48 -05:00 · 39abedd1d7
commit 39abedd1d7
parent 84f7fc4553
8 changed files with 24 additions and 31 deletions
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -847,9 +847,9 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
    };

    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
+    if (!komputeManager()->hasAlgorithm(__func__)) {
        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
-    else {
+    } else {
        s_algo = komputeManager()->getAlgorithm(__func__);
        s_algo->setTensors({in, out});
        s_algo->setWorkgroup({(uint32_t)nrows});
--- a/kompute/op_cpy_f16_f16.comp
+++ b/kompute/op_cpy_f16_f16.comp
@@ -10,13 +10,12 @@

 #include "common.comp"

-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2

-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;

 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {

    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_data+i00] = OUT_TYPE(in_[src]);
    }
--- a/kompute/op_cpy_f16_f32.comp
+++ b/kompute/op_cpy_f16_f32.comp
@@ -10,13 +10,12 @@

 #include "common.comp"

-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4

-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;

 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {

    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_data+i00] = OUT_TYPE(in_[src]);
    }
--- a/kompute/op_cpy_f32_f16.comp
+++ b/kompute/op_cpy_f32_f16.comp
@@ -10,13 +10,12 @@

 #include "common.comp"

-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2

-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;

 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {

    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_data+i00] = OUT_TYPE(in_[src]);
    }
--- a/kompute/op_cpy_f32_f32.comp
+++ b/kompute/op_cpy_f32_f32.comp
@@ -2,13 +2,12 @@

 #include "common.comp"

-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4

-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;

 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -46,7 +45,7 @@ void main() {

    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_data+i00] = OUT_TYPE(in_[src]);
    }
--- a/kompute/op_norm.comp
+++ b/kompute/op_norm.comp
@@ -10,9 +10,7 @@

 #include "common.comp"

-#define nth 256
-
-layout(local_size_x = nth) in;
+layout(local_size_x = 256) in;

 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
    float eps;
 } pcs;

-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];

 void main() {
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
    // MEAN
    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[x+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
@@ -57,21 +55,21 @@ void main() {

    // recenter
    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] = in_[x+i00] - mean;
    }

    // VARIANCE
    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
@@ -88,7 +86,7 @@ void main() {
    const float variance = sum[0];

    const float scale = 1.0f/sqrt(variance + pcs.eps);
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] *= scale;
    }
 }
--- a/kompute/op_rmsnorm.comp
+++ b/kompute/op_rmsnorm.comp
@@ -10,9 +10,7 @@

 #include "common.comp"

-#define nth 512
-
-layout(local_size_x = nth) in;
+layout(local_size_x = 512) in;

 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
    float eps;
 } pcs;

-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];

 void main() {
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_

    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
@@ -57,7 +55,7 @@ void main() {
    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);

    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] = in_[x+i00] * scale;
    }
 }
--- a/kompute/op_rope_f32.comp
+++ b/kompute/op_rope_f32.comp
@@ -10,6 +10,7 @@

 #include "common.comp"

+// TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;

 layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };