vulkan : optimize workgroup sizes

Jared Van Bortel 2023-11-23 17:18:48 -05:00
parent 84f7fc4553
commit 39abedd1d7
8 changed files with 24 additions and 31 deletions

View File

@@ -847,9 +847,9 @@ void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
     };
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
+    if (!komputeManager()->hasAlgorithm(__func__)) {
         s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
-    else {
+    } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({(uint32_t)nrows});

View File

@@ -10,13 +10,12 @@
 #include "common.comp"
-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }

View File

@@ -10,13 +10,12 @@
 #include "common.comp"
-#define nth 32
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }

View File

@@ -10,13 +10,12 @@
 #include "common.comp"
-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -54,7 +53,7 @@ void main() {
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }

View File

@@ -2,13 +2,12 @@
 #include "common.comp"
-#define nth 32
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
-layout(local_size_x = nth) in;
+layout(local_size_x = 1024) in;
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
@@ -46,7 +45,7 @@ void main() {
     const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
         out_[dst_data+i00] = OUT_TYPE(in_[src]);
     }
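The four copy shaders above (every float16/float32 input and output combination) get the same two changes: the fixed `nth` of 32 becomes a hardcoded workgroup of 1024 invocations, and the inner loop strides by `gl_WorkGroupSize.x` instead of the macro, so each invocation handles every 1024th element of its row. Below is a minimal standalone sketch of that pattern, with simplified float buffers and a hypothetical single `ne00` push constant in place of the real shaders' `nb*` strides and offsets:

#version 450

// 1024 invocations per workgroup; each workgroup copies one row in this sketch.
layout(local_size_x = 1024) in;

layout(binding = 0, std430) readonly  buffer tensorIn  { float in_[];  };
layout(binding = 1, std430) writeonly buffer tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint ne00;   // elements per row (hypothetical; the real shaders index via nb* strides)
} pcs;

void main() {
    const uint row = gl_WorkGroupID.x * pcs.ne00;
    // Strided loop: invocation k copies elements k, k+1024, k+2048, ... of its row.
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[row + i00] = in_[row + i00];
    }
}

The grid-stride form keeps the kernel correct for any `ne00`, whether the row is shorter or much longer than the workgroup.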

View File

@@ -10,9 +10,7 @@
 #include "common.comp"
-#define nth 256
-layout(local_size_x = nth) in;
+layout(local_size_x = 256) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
     float eps;
 } pcs;
-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];
 void main() {
     const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
     // MEAN
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += in_[x+i00];
     }
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -57,21 +55,21 @@ void main() {
     // recenter
     const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] = in_[x+i00] - mean;
     }
     // VARIANCE
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
     }
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -88,7 +86,7 @@ void main() {
     const float variance = sum[0];
     const float scale = 1.0f/sqrt(variance + pcs.eps);
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] *= scale;
     }
 }
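A detail that makes this rewrite work: `gl_WorkGroupSize` is a compile-time constant in GLSL, so once `local_size_x` is declared it can size the `shared` array and feed the `[[unroll]]` loop bound exactly as the old `nth` macro did. Below is a minimal, self-contained sketch of the strided-sum-plus-tree-reduction pattern the norm shader above uses, assuming a power-of-two workgroup size (256, as above) and a hypothetical single `ne00` push constant; it computes only a per-row mean, not the full normalization:

#version 450
#extension GL_EXT_control_flow_attributes : require   // for [[unroll]]

layout(local_size_x = 256) in;

layout(binding = 0, std430) buffer restrict readonly  tensorIn  { float in_[];  };
layout(binding = 1, std430) buffer restrict writeonly tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint ne00;   // elements per row (hypothetical; the real shader also carries strides and offsets)
} pcs;

// Legal because gl_WorkGroupSize is a compile-time constant derived from local_size_x.
shared float sum[gl_WorkGroupSize.x];

void main() {
    const uint row = gl_WorkGroupID.x * pcs.ne00;   // one workgroup per row

    // Strided partial sums: invocation k adds elements k, k+256, k+512, ...
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[row + i00];
    }

    // Tree reduction in shared memory; assumes a power-of-two workgroup size.
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // sum[0] now holds the row total; write one mean per row.
    if (gl_LocalInvocationID.x == 0) {
        out_[gl_WorkGroupID.x] = sum[0] / float(pcs.ne00);
    }
}

The halving loop relies on the workgroup size being a power of two, which holds for the 256, 512, and 1024 used in this commit.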

View File

@@ -10,9 +10,7 @@
 #include "common.comp"
-#define nth 512
-layout(local_size_x = nth) in;
+layout(local_size_x = 512) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
@@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants {
     float eps;
 } pcs;
-shared float sum[nth];
+shared float sum[gl_WorkGroupSize.x];
 void main() {
     const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
     // parallel sum
     sum[gl_LocalInvocationID.x] = 0.0;
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
     }
     // reduce
     barrier();
     memoryBarrierShared();
-    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
         if (gl_LocalInvocationID.x < i) {
             sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
         }
@@ -57,7 +55,7 @@ void main() {
     const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
     const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
         out_[y+i00] = in_[x+i00] * scale;
     }
 }

View File

@@ -10,6 +10,7 @@
 #include "common.comp"
+// TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
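The TODO above flags the one shader this commit leaves at a workgroup size of 1. A hypothetical sketch of the direction it points at, not the actual shader body (which is truncated above): give the workgroup 32 or more invocations, let them stride over a per-output reduction (here a dot product of two assumed inputs), and combine the partial sums with the same shared-memory reduction the norm shaders use. Buffer layouts, bindings, and the `ne00` push constant below are simplified assumptions.

#version 450
#extension GL_EXT_control_flow_attributes : require   // for [[unroll]]

layout(local_size_x = 32) in;

layout(binding = 0, std430) buffer restrict readonly  tensorInA { float inA[]; };
layout(binding = 1, std430) buffer restrict readonly  tensorInB { float inB[]; };
layout(binding = 2, std430) buffer restrict writeonly tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint ne00;   // length of the rows being multiplied (hypothetical)
} pcs;

shared float partial[gl_WorkGroupSize.x];

void main() {
    // Hypothetical mapping: one workgroup per output element of a row-by-row dot product.
    const uint rowA = gl_WorkGroupID.x * pcs.ne00;
    const uint rowB = gl_WorkGroupID.y * pcs.ne00;

    // Each invocation accumulates a strided slice of the dot product.
    float acc = 0.0;
    for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
        acc += inA[rowA + i] * inB[rowB + i];
    }
    partial[gl_LocalInvocationID.x] = acc;

    // Power-of-two tree reduction, as in the norm shaders above.
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            partial[gl_LocalInvocationID.x] += partial[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    if (gl_LocalInvocationID.x == 0) {
        out_[gl_WorkGroupID.y * gl_NumWorkGroups.x + gl_WorkGroupID.x] = partial[0];
    }
}

Whether 32, 256, or 1024 invocations is best here is exactly the tuning question the TODO leaves open.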