#version 450

// Row-gather shader: for every workgroup, look up a row index in inB, then
// copy that row out of the float16 buffer inA into the float buffer out_.
// The float16_t -> float widening happens implicitly on assignment.

// NOTE(review): float16_t is not declared in this file; presumably the
// required extension/type setup comes from common.comp -- confirm.
#include "common.comp"

// A single invocation per workgroup; the workgroup id picks the output row.
layout(local_size_x = 1) in;

// Source values, stored as 16-bit floats.
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
// One integer per workgroup, used as the row multiplier when addressing inA.
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
// Destination values, stored as 32-bit floats.
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };

layout (push_constant) uniform parameter {
    uint inAOff;  // added to the computed element index into inA
    uint inBOff;  // added to the workgroup id when indexing inB
    uint outOff;  // added to the computed element index into out_
    int ne00;     // number of elements copied per row
    int nb01;     // per-row multiplier for inA; halved in main() (2 bytes per float16)
    int nb1;      // per-row multiplier for out_; divided by 4 in main()
                  // (presumably the byte size of a float -- confirm)
} pcs;

// Copies `count` consecutive elements from inA to out_.
//   srcBase - index of the first element to read, relative to inA
//   dstBase - index of the first element to write, relative to out_
//   count   - how many elements to copy; nothing is written when count <= 0
void dequantize_row_f16(uint srcBase, uint dstBase, int count) {
    int elem = 0;
    while (elem < count) {
        out_[dstBase + elem] = inA[srcBase + elem];
        elem++;
    }
}

void main() {
    const uint row = gl_WorkGroupID.x;
    const int srcRow = inB[row + pcs.inBOff];

    // Signed arithmetic first (srcRow and nb01 are both int), then the sum with
    // the unsigned offset is carried out as uint.
    const int srcElemOffset = srcRow * pcs.nb01 / 2; // bytes for float16
    const uint srcBase = srcElemOffset + pcs.inAOff;

    // row is unsigned, so this whole expression is evaluated as uint.
    const uint dstElemOffset = row * pcs.nb1 / 4;
    const uint dstBase = dstElemOffset + pcs.outOff;

    dequantize_row_f16(srcBase, dstBase, pcs.ne00);
}