From 2471d56a2e202c7ed83877d2b246ea3903880cbb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 22 Oct 2023 09:22:54 +0300
Subject: [PATCH] llama : profiling the attention compute

---
 llama.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 365349335..4bd6ffd80 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5815,6 +5815,24 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+#if 1
+    for (int i = 0; i < result->n_nodes; ++i) {
+        struct ggml_tensor * node = result->nodes[i];
+        if (getenv("SKIP_KQ_ALL")) {
+            if (
+                strcmp(node->name, "KQ") == 0 ||
+                strcmp(node->name, "KQ_scaled") == 0 ||
+                strcmp(node->name, "KQ_masked") == 0 ||
+                strcmp(node->name, "KQ_soft_max") == 0 ||
+                strcmp(node->name, "KQV") == 0 ||
+                false) {
+                //printf("skipping %s\n", dst->name);
+                node->op = GGML_OP_NONE;
+            }
+        }
+    }
+#endif
+
     return result;
 }