mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-27 03:44:35 +00:00
llama : profiling the attention compute
This commit is contained in:
parent
22c69a2794
commit
2471d56a2e
18
llama.cpp
18
llama.cpp
@@ -5815,6 +5815,24 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
for (int i = 0; i < result->n_nodes; ++i) {
|
||||||
|
struct ggml_tensor * node = result->nodes[i];
|
||||||
|
if (getenv("SKIP_KQ_ALL")) {
|
||||||
|
if (
|
||||||
|
strcmp(node->name, "KQ") == 0 ||
|
||||||
|
strcmp(node->name, "KQ_scaled") == 0 ||
|
||||||
|
strcmp(node->name, "KQ_masked") == 0 ||
|
||||||
|
strcmp(node->name, "KQ_soft_max") == 0 ||
|
||||||
|
strcmp(node->name, "KQV") == 0 ||
|
||||||
|
false) {
|
||||||
|
//printf("skipping %s\n", dst->name);
|
||||||
|
node->op = GGML_OP_NONE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user