mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-15 15:29:53 +00:00
80dd7ff22f
* tests: Fix memory bandwidth calculation for perf tests Add a flops calculation for flash attention. Add one GGML_OP_CPY perf test. * vulkan: Optimize contiguous copies Add a variant of the copy shader for when the tensors are contiguous. Avoid the complex addressing calculations, and do four elements per invocation to hide some other overhead. Apply similar changes to the scale shader, since scale is always contiguous. Add a "progress bar" for shader compiles.
18 lines
408 B
Plaintext
18 lines
408 B
Plaintext
#version 450
|
|
|
|
#include "types.comp"
|
|
#include "generic_unary_head.comp"
|
|
|
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
void main() {
|
|
const uint idx = get_idx();
|
|
|
|
if (idx >= p.ne) {
|
|
return;
|
|
}
|
|
|
|
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
|
}
|