mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-10 18:51:45 +00:00
Merge remote-tracking branch 'origin/master' into sl/dl-backend
This commit is contained in:
commit
4428593487
@ -178,7 +178,7 @@ struct common_params {
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
float defrag_thold = 0.1f; // KV cache defragmentation threshold
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
@ -39,7 +39,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
@ -64,7 +64,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
|
||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
|
||||
@ -99,25 +99,27 @@ The project is under active development, and we are [looking for feedback and co
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;typ_p;top_p;min_p;temperature) |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) |
|
||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||
| `--temp N` | temperature (default: 0.8) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
||||
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
|
||||
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
|
||||
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
||||
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
|
||||
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
|
||||
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
|
||||
| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
|
||||
| `--dry-base N` | DRY sampling base value (default: 1.75) |
|
||||
| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) |
|
||||
| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
|
||||
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
|
||||
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
|
||||
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
|
||||
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
|
||||
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
|
||||
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> |
|
||||
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
|
||||
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
|
||||
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
|
||||
|
@ -2942,6 +2942,7 @@ kernel void kernel_flash_attn_ext(
|
||||
half smax = -INFINITY;
|
||||
|
||||
// load the mask in shared memory
|
||||
#pragma unroll(Q)
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*nb31);
|
||||
|
||||
@ -2968,7 +2969,7 @@ kernel void kernel_flash_attn_ext(
|
||||
// we can read directly from global memory
|
||||
device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll(D8)
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
k8x8_t mk;
|
||||
simdgroup_load(mk, pk + i*8, nb_12_1/sizeof(k_t), 0, true); // transpose // TODO: use ne10
|
||||
@ -2989,7 +2990,7 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll(4)
|
||||
for (short k = 0; k < 4; ++k) {
|
||||
k8x8_t mk;
|
||||
|
||||
@ -3067,7 +3068,7 @@ kernel void kernel_flash_attn_ext(
|
||||
s8x8_t mm;
|
||||
simdgroup_load(mm, ss + 2*C, TS, 0, false);
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll(D8)
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
simdgroup_multiply(lo[i], mm, lo[i]);
|
||||
}
|
||||
@ -3082,7 +3083,8 @@ kernel void kernel_flash_attn_ext(
|
||||
if (is_same<vd4x4_t, v4x4_t>::value) {
|
||||
// we can read directly from global memory
|
||||
device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
|
||||
#pragma unroll
|
||||
|
||||
#pragma unroll(D8)
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
v8x8_t mv;
|
||||
simdgroup_load(mv, pv + i*8, nb_12_1/sizeof(v_t), 0, false); // TODO: use ne20
|
||||
@ -3103,7 +3105,7 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll(4)
|
||||
for (short k = 0; k < 4; ++k) {
|
||||
v8x8_t mv;
|
||||
|
||||
@ -3196,6 +3198,7 @@ kernel void kernel_flash_attn_ext(
|
||||
simdgroup_load(ms0, ss + 2*C, TS, 0, false);
|
||||
simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
|
||||
|
||||
#pragma unroll(D8)
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
o8x8_t t;
|
||||
|
||||
@ -3413,6 +3416,7 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
// load the queries from shared memory into local memory
|
||||
q4x4_t mq[D16/NL];
|
||||
|
||||
#pragma unroll(D16/NL)
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
mq[ii/NL] = sq4x4[ii + tx];
|
||||
}
|
||||
@ -3454,17 +3458,23 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
|
||||
device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll(D16/NL)
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
const short i = ii + tx;
|
||||
|
||||
k4x4_t mk;
|
||||
deq_k(pk + i/nl_k, i%nl_k, mk);
|
||||
|
||||
mqka[0] += dot(mq[ii/NL][0], mk[0]);
|
||||
mqka[1] += dot(mq[ii/NL][1], mk[1]);
|
||||
mqka[2] += dot(mq[ii/NL][2], mk[2]);
|
||||
mqka[3] += dot(mq[ii/NL][3], mk[3]);
|
||||
// note: this is less precise than the version below
|
||||
//mqka[0] += dot(mq[ii/NL][0], mk[0]);
|
||||
//mqka[1] += dot(mq[ii/NL][1], mk[1]);
|
||||
//mqka[2] += dot(mq[ii/NL][2], mk[2]);
|
||||
//mqka[3] += dot(mq[ii/NL][3], mk[3]);
|
||||
|
||||
mqka[0] += dot((float4) mq[ii/NL][0], (float4) mk[0]);
|
||||
mqka[1] += dot((float4) mq[ii/NL][1], (float4) mk[1]);
|
||||
mqka[2] += dot((float4) mq[ii/NL][2], (float4) mk[2]);
|
||||
mqka[3] += dot((float4) mq[ii/NL][3], (float4) mk[3]);
|
||||
}
|
||||
|
||||
qk_t mqk = mqka[0] + mqka[1] + mqka[2] + mqka[3];
|
||||
@ -3513,7 +3523,7 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
ss[tiisg] = vs;
|
||||
|
||||
// O = diag(ms)*O
|
||||
#pragma unroll
|
||||
#pragma unroll(D16/NL)
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
lo[ii/NL] *= ms;
|
||||
}
|
||||
@ -3523,13 +3533,12 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
|
||||
// O = O + (Q*K^T)*V
|
||||
{
|
||||
#pragma unroll
|
||||
for (short cc = 0; cc < C/4; ++cc) {
|
||||
device const vd4x4_t * pv4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3));
|
||||
|
||||
const s4x4_t ms(ss[4*cc + ty]);
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll(D16/NL)
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
const short i = ii + tx;
|
||||
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <cassert>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
@ -92,11 +93,11 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s
|
||||
std::array<char, 128> buffer;
|
||||
DWORD bytes_read;
|
||||
|
||||
while (ReadFile(stdout_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
||||
while (ReadFile(stdout_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
||||
stdout_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
while (ReadFile(stderr_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
||||
while (ReadFile(stderr_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
||||
stderr_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
@ -190,7 +191,12 @@ std::string basename(const std::string &path) {
|
||||
return path.substr(path.find_last_of("/\\") + 1);
|
||||
}
|
||||
|
||||
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
|
||||
// variables to track number of compiles in progress
|
||||
static uint32_t compile_count = 0;
|
||||
static std::mutex compile_count_mutex;
|
||||
static std::condition_variable compile_count_cond;
|
||||
|
||||
void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
|
||||
std::string name = _name + (fp16 ? "" : "_fp32");
|
||||
std::string out_fname = join_paths(output_dir, name + ".spv");
|
||||
std::string in_path = join_paths(input_dir, in_fname);
|
||||
@ -233,6 +239,12 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
||||
assert(compile_count > 0);
|
||||
compile_count--;
|
||||
}
|
||||
compile_count_cond.notify_all();
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
|
||||
@ -241,7 +253,22 @@ std::map<std::string, std::string> merge_maps(const std::map<std::string, std::s
|
||||
return result;
|
||||
}
|
||||
|
||||
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id) {
|
||||
static std::vector<std::future<void>> compiles;
|
||||
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
|
||||
{
|
||||
// wait until fewer than N compiles are in progress.
|
||||
// 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors.
|
||||
uint32_t N = 16;
|
||||
std::unique_lock<std::mutex> guard(compile_count_mutex);
|
||||
while (compile_count >= N) {
|
||||
compile_count_cond.wait(guard);
|
||||
}
|
||||
compile_count++;
|
||||
}
|
||||
compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16));
|
||||
}
|
||||
|
||||
void matmul_shaders(bool fp16, bool matmul_id) {
|
||||
std::string load_vec = fp16 ? "8" : "4";
|
||||
std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4";
|
||||
std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4";
|
||||
@ -259,19 +286,11 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
|
||||
}
|
||||
|
||||
// Shaders with f16 B_TYPE
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
||||
string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
||||
string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
||||
|
||||
for (const auto& tname : type_names) {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
@ -279,22 +298,18 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
|
||||
std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
|
||||
// For aligned matmul loads
|
||||
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
|
||||
}
|
||||
}
|
||||
|
||||
void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||
void process_shaders() {
|
||||
std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
|
||||
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
|
||||
|
||||
for (const auto& fp16 : {false, true}) {
|
||||
matmul_shaders(tasks, fp16, false);
|
||||
matmul_shaders(tasks, fp16, true);
|
||||
matmul_shaders(fp16, false);
|
||||
matmul_shaders(fp16, true);
|
||||
}
|
||||
|
||||
for (const auto& tname : type_names) {
|
||||
@ -302,201 +317,103 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
// Dequant shaders
|
||||
if (tname != "f16") {
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
|
||||
}));
|
||||
string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
|
||||
}
|
||||
|
||||
if (!string_ends_with(tname, "_k")) {
|
||||
shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
|
||||
|
||||
if (tname == "f16") {
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
}));
|
||||
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
} else {
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
|
||||
}));
|
||||
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
|
||||
}
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
|
||||
}
|
||||
}
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
// Norms
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
}));
|
||||
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
||||
}));
|
||||
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
|
||||
}));
|
||||
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
}));
|
||||
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
}));
|
||||
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
||||
}));
|
||||
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
||||
}));
|
||||
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
}
|
||||
|
||||
void write_output_files() {
|
||||
@ -591,12 +508,7 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::future<void>> tasks;
|
||||
process_shaders(tasks);
|
||||
|
||||
for (auto& task : tasks) {
|
||||
task.get();
|
||||
}
|
||||
process_shaders();
|
||||
|
||||
write_output_files();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user