mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-15 15:29:53 +00:00
80dd7ff22f
* tests: Fix memory bandwidth calculation for perf tests
  Add a flops calculation for flash attention.
  Add one GGML_OP_CPY perf test.
* vulkan: Optimize contiguous copies
  Add a variant of the copy shader for when the tensors are contiguous. Avoid the complex addressing calculations, and do four elements per invocation to hide some other overhead.
  Apply similar changes to the scale shader, since scale is always contiguous.
  Add a "progress bar" for shader compiles.
29 lines
952 B
Plaintext
29 lines
952 B
Plaintext
#version 450
|
|
|
|
#include "types.comp"
|
|
#include "generic_unary_head.comp"
|
|
|
|
// Workgroup size: 512 invocations along x, 1 along y and 1 along z.
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
// Writes one destination element per invocation.
//
// The invocation's linear index is split into a 4-D coordinate using the
// extents p.ne10, p.ne11 and p.ne12. If that coordinate lies inside the
// source extents (p.ne00 .. p.ne03) the element is read from data_a;
// otherwise zero is written.
// NOTE(review): p, data_a, data_d and D_TYPE are not declared in this file;
// presumably they come from the included headers -- confirm there.
void main() {
    // Flatten the 3-D invocation ID into a linear element index.
    // 262144 == 512 * 512. NOTE(review): this presumably assumes the dispatch
    // covers 512 invocations in x and at most 512 in y -- confirm against the
    // host-side dispatch code.
    const uint elem = gl_GlobalInvocationID.x
                    + gl_GlobalInvocationID.y * 512
                    + gl_GlobalInvocationID.z * 262144;

    // Only invocations below the element count p.ne write anything.
    if (elem < p.ne) {
        // Element counts spanned by one step along dimensions 1, 2 and 3.
        const uint row_len   = p.ne10;
        const uint plane_len = p.ne11 * row_len;
        const uint cube_len  = p.ne12 * plane_len;

        // Peel coordinates off from the outermost dimension inwards, keeping
        // the running remainder after each step.
        const uint c3   = elem / cube_len;
        const uint rem3 = elem - c3 * cube_len;
        const uint c2   = rem3 / plane_len;
        const uint rem2 = rem3 - c2 * plane_len;
        const uint c1   = rem2 / row_len;
        const uint c0   = rem2 - c1 * row_len;

        // Offsets into the source (nb0x) and destination (nb1x) buffers.
        const uint src_at = c0*p.nb00 + c1*p.nb01 + c2*p.nb02 + c3*p.nb03;
        const uint dst_at = c0*p.nb10 + c1*p.nb11 + c2*p.nb12 + c3*p.nb13;

        // True when the coordinate falls outside the source extents in at
        // least one dimension; such elements are filled with zero.
        const bool outside_src = c0 >= p.ne00 || c1 >= p.ne01 || c2 >= p.ne02 || c3 >= p.ne03;

        data_d[p.d_offset + dst_at] = D_TYPE(outside_src ? 0.0f : data_a[src_at]);
    }
}
|