From 39abedd1d75b83cc9ff6f5c951d2e4f63d840bdf Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 23 Nov 2023 17:18:48 -0500
Subject: [PATCH] vulkan : optimize workgroup sizes

---
 ggml-vulkan.cpp             |  4 ++--
 kompute/op_cpy_f16_f16.comp |  5 ++---
 kompute/op_cpy_f16_f32.comp |  5 ++---
 kompute/op_cpy_f32_f16.comp |  5 ++---
 kompute/op_cpy_f32_f32.comp |  5 ++---
 kompute/op_norm.comp        | 18 ++++++++----------
 kompute/op_rmsnorm.comp     | 12 +++++-------
 kompute/op_rope_f32.comp    |  1 +
 8 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 3e3f6cc80..74d9fceb6 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -847,9 +847,9 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
+    if (!komputeManager()->hasAlgorithm(__func__)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
-    else {
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({(uint32_t)nrows});
diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp
index 5f425ae28..652db0313 100644
--- a/kompute/op_cpy_f16_f16.comp
+++ b/kompute/op_cpy_f16_f16.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp
index 4298bebdd..aa204248c 100644
--- a/kompute/op_cpy_f16_f32.comp
+++ b/kompute/op_cpy_f16_f32.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp
index 2d763edfd..4fdab4831 100644
--- a/kompute/op_cpy_f32_f16.comp
+++ b/kompute/op_cpy_f32_f16.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp
index 4e5b1d393..2fc998492 100644
--- a/kompute/op_cpy_f32_f32.comp
+++ b/kompute/op_cpy_f32_f32.comp
@@ -2,13 +2,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -46,7 +45,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp
index 5aafeaac5..1d685cf36 100644
--- a/kompute/op_norm.comp
+++ b/kompute/op_norm.comp
@@ -10,9 +10,7 @@
 
 #include "common.comp"
 
-#define nth 256
-
-layout(local_size_x = nth) in;
+layout(local_size_x = 256) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
     float eps;
 } pcs;
 
-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];
 
 void main() {
     const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
     // MEAN
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += in_[x+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -57,21 +55,21 @@ void main() {
 
     // recenter
     const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] = in_[x+i00] - mean;
     }
 
     // VARIANCE
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -88,7 +86,7 @@ void main() {
     const float variance = sum[0];
 
     const float scale = 1.0f/sqrt(variance + pcs.eps);
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] *= scale;
     }
 }
diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp
index 8d6c0fa6a..5ebaf2269 100644
--- a/kompute/op_rmsnorm.comp
+++ b/kompute/op_rmsnorm.comp
@@ -10,9 +10,7 @@
 
 #include "common.comp"
 
-#define nth 512
-
-layout(local_size_x = nth) in;
+layout(local_size_x = 512) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
     float eps;
 } pcs;
 
-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];
 
 void main() {
     const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
 
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -57,7 +55,7 @@ void main() {
     const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
 
     const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] = in_[x+i00] * scale;
     }
 }
diff --git a/kompute/op_rope_f32.comp b/kompute/op_rope_f32.comp
index 6024c3e5e..0cf83fec0 100644
--- a/kompute/op_rope_f32.comp
+++ b/kompute/op_rope_f32.comp
@@ -10,6 +10,7 @@
 
 #include "common.comp"
 
+// TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;
 
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
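
Note (reviewer addition, not part of the patch itself): the recurring change above replaces the hand-maintained `nth` macro with `gl_WorkGroupSize.x`. Once `layout(local_size_x = ...)` is declared, GLSL treats `gl_WorkGroupSize` as a compile-time constant, so it can size `shared` arrays and serve as the loop stride with nothing to keep in sync by hand. A minimal standalone sketch of that pattern follows; the buffer names, the 256 local size, the use of `.length()`, and the single-workgroup dispatch are illustrative assumptions, not code from this repository:

    // hypothetical reduce_sum.comp -- sums a buffer using the same
    // strided-loop + tree-reduction pattern as op_norm/op_rmsnorm
    #version 450

    layout(local_size_x = 256) in; // must be a power of two for the reduction

    layout(binding = 0) buffer restrict readonly  tensorIn  { float in_[];  };
    layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };

    // gl_WorkGroupSize is a constant expression once local_size_x is
    // declared, so it can size shared storage -- no nth macro needed
    shared float sum[gl_WorkGroupSize.x];

    void main() {
        const uint n = uint(in_.length());

        // strided accumulation: each invocation visits elements
        // gl_WorkGroupSize.x apart, as in the loops in the patch
        sum[gl_LocalInvocationID.x] = 0.0;
        for (uint i = gl_LocalInvocationID.x; i < n; i += gl_WorkGroupSize.x) {
            sum[gl_LocalInvocationID.x] += in_[i];
        }

        // tree reduction: halve the number of active invocations each step
        barrier();
        memoryBarrierShared();
        for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
            if (gl_LocalInvocationID.x < i) {
                sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
            }
            barrier();
        }

        // invocation 0 holds the total; assuming a single-workgroup
        // dispatch, e.g. vkCmdDispatch(cmd, 1, 1, 1), one result is written
        if (gl_LocalInvocationID.x == 0) {
            out_[0] = sum[0];
        }
    }

With the macro gone, each shader's workgroup size lives only in its layout declaration: the copy kernels move to the 1024 maximum, op_norm and op_rmsnorm keep 256 and 512 alongside their per-invocation shared-memory slots, and op_rope_f32 stays at 1 with a TODO recording the intent to raise it.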