#version 450

#include "common.comp"

// One invocation per workgroup; each invocation handles several elements (see main).
layout(local_size_x = 1) in;

// Storage buffers: two read-only float inputs and one write-only float output.
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };

// Per-dispatch parameters. The offsets are added directly to float-array
// indices, so they are expressed in elements rather than bytes.
layout(push_constant) uniform PushConstants {
    uint inAOff; // element offset applied to every inA access
    uint inBOff; // element offset applied to every inB access
    uint outOff; // element offset applied to every out_ access
    uint row;    // period after which the inB index wraps back to its start
} pcs;

// Element-wise product: out_[i] = inA[i] * inB[i % row], with the per-buffer
// offsets from the push constants applied to each access.
//
// Workgroup g covers the four consecutive elements 4*g .. 4*g + 3.
// NOTE(review): no bounds check is performed and `row` is used as a modulo
// divisor without validation; presumably the host sizes the dispatch to the
// element count and passes a non-zero row -- confirm against the caller.
void main() {
    const uint elemsPerGroup = 4u;
    const uint firstElem = gl_WorkGroupID.x * elemsPerGroup;

    for (uint k = 0u; k < elemsPerGroup; ++k) {
        const uint elem = firstElem + k;

        // inB is indexed modulo pcs.row, so its values repeat every `row` elements.
        const float lhs = inA[pcs.inAOff + elem];
        const float rhs = inB[pcs.inBOff + (elem % pcs.row)];

        out_[pcs.outOff + elem] = lhs * rhs;
    }
}