From 2471d56a2e202c7ed83877d2b246ea3903880cbb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 22 Oct 2023 09:22:54 +0300
Subject: [PATCH] llama : profiling the attention compute

---
 llama.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 365349335..4bd6ffd80 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5815,6 +5815,24 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+#if 1
+    for (int i = 0; i < result->n_nodes; ++i) {
+        struct ggml_tensor * node = result->nodes[i];
+        if (getenv("SKIP_KQ_ALL")) {
+            if (
+                strcmp(node->name, "KQ") == 0 ||
+                strcmp(node->name, "KQ_scaled") == 0 ||
+                strcmp(node->name, "KQ_masked") == 0 ||
+                strcmp(node->name, "KQ_soft_max") == 0 ||
+                strcmp(node->name, "KQV") == 0 ||
+                false) {
+                //printf("skipping %s\n", dst->name);
+                node->op = GGML_OP_NONE;
+            }
+        }
+    }
+#endif
+
     return result;
 }