From 39abedd1d75b83cc9ff6f5c951d2e4f63d840bdf Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 23 Nov 2023 17:18:48 -0500
Subject: [PATCH] vulkan : optimize workgroup sizes

---
 ggml-vulkan.cpp             |  4 ++--
 kompute/op_cpy_f16_f16.comp |  5 ++---
 kompute/op_cpy_f16_f32.comp |  5 ++---
 kompute/op_cpy_f32_f16.comp |  5 ++---
 kompute/op_cpy_f32_f32.comp |  5 ++---
 kompute/op_norm.comp        | 18 ++++++++----------
 kompute/op_rmsnorm.comp     | 12 +++++-------
 kompute/op_rope_f32.comp    |  1 +
 8 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 3e3f6cc80..74d9fceb6 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -847,9 +847,9 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
+    if (!komputeManager()->hasAlgorithm(__func__)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
-    else {
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({(uint32_t)nrows});
diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp
index 5f425ae28..652db0313 100644
--- a/kompute/op_cpy_f16_f16.comp
+++ b/kompute/op_cpy_f16_f16.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp
index 4298bebdd..aa204248c 100644
--- a/kompute/op_cpy_f16_f32.comp
+++ b/kompute/op_cpy_f16_f32.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp
index 2d763edfd..4fdab4831 100644
--- a/kompute/op_cpy_f32_f16.comp
+++ b/kompute/op_cpy_f32_f16.comp
@@ -10,13 +10,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp
index 4e5b1d393..2fc998492 100644
--- a/kompute/op_cpy_f32_f32.comp
+++ b/kompute/op_cpy_f32_f32.comp
@@ -2,13 +2,12 @@
 
 #include "common.comp"
 
-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
 
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -46,7 +45,7 @@ void main() {
 
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
 
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp
index 5aafeaac5..1d685cf36 100644
--- a/kompute/op_norm.comp
+++ b/kompute/op_norm.comp
@@ -10,9 +10,7 @@
 
 #include "common.comp"
 
-#define nth 256
-
-layout(local_size_x = nth) in;
+layout(local_size_x = 256) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
     float eps;
 } pcs;
 
-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];
 
 void main() {
     const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
     // MEAN
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += in_[x+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -57,21 +55,21 @@ void main() {
 
     // recenter
     const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] = in_[x+i00] - mean;
     }
 
     // VARIANCE
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -88,7 +86,7 @@ void main() {
     const float variance = sum[0];
 
     const float scale = 1.0f/sqrt(variance + pcs.eps);
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] *= scale;
     }
 }
diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp
index 8d6c0fa6a..5ebaf2269 100644
--- a/kompute/op_rmsnorm.comp
+++ b/kompute/op_rmsnorm.comp
@@ -10,9 +10,7 @@
 
 #include "common.comp"
 
-#define nth 512
-
-layout(local_size_x = nth) in;
+layout(local_size_x = 512) in;
 
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
     float eps;
 } pcs;
 
-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];
 
 void main() {
     const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
 
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
     }
 
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -57,7 +55,7 @@ void main() {
     const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
 
     const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] = in_[x+i00] * scale;
     }
 }
diff --git a/kompute/op_rope_f32.comp b/kompute/op_rope_f32.comp
index 6024c3e5e..0cf83fec0 100644
--- a/kompute/op_rope_f32.comp
+++ b/kompute/op_rope_f32.comp
@@ -10,6 +10,7 @@
 
 #include "common.comp"
 
+// TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;
 
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
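
Note (reviewer addition, not part of the patch itself): the recurring change above replaces the hand-maintained `nth` macro with `gl_WorkGroupSize.x`. Once `layout(local_size_x = ...)` is declared, GLSL treats `gl_WorkGroupSize` as a compile-time constant, so it can size `shared` arrays and serve as the loop stride with nothing to keep in sync by hand. A minimal standalone sketch of that pattern follows; the buffer names, the 256 local size, the use of `.length()`, and the single-workgroup dispatch are illustrative assumptions, not code from this repository:

    // hypothetical reduce_sum.comp -- sums a buffer using the same
    // strided-loop + tree-reduction pattern as op_norm/op_rmsnorm
    #version 450

    layout(local_size_x = 256) in; // must be a power of two for the reduction

    layout(binding = 0) buffer restrict readonly  tensorIn  { float in_[];  };
    layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };

    // gl_WorkGroupSize is a constant expression once local_size_x is
    // declared, so it can size shared storage -- no nth macro needed
    shared float sum[gl_WorkGroupSize.x];

    void main() {
        const uint n = uint(in_.length());

        // strided accumulation: each invocation visits elements
        // gl_WorkGroupSize.x apart, as in the loops in the patch
        sum[gl_LocalInvocationID.x] = 0.0;
        for (uint i = gl_LocalInvocationID.x; i < n; i += gl_WorkGroupSize.x) {
            sum[gl_LocalInvocationID.x] += in_[i];
        }

        // tree reduction: halve the number of active invocations each step
        barrier();
        memoryBarrierShared();
        for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
            if (gl_LocalInvocationID.x < i) {
                sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
            }
            barrier();
        }

        // invocation 0 holds the total; assuming a single-workgroup
        // dispatch, e.g. vkCmdDispatch(cmd, 1, 1, 1), one result is written
        if (gl_LocalInvocationID.x == 0) {
            out_[0] = sum[0];
        }
    }

With the macro gone, each shader's workgroup size lives only in its layout declaration: the copy kernels move to the 1024 maximum, op_norm and op_rmsnorm keep 256 and 512 alongside their per-invocation shared-memory slots, and op_rope_f32 stays at 1 with a TODO recording the intent to raise it.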