From 6cea9d8df29d1583e0666741c597bc2f784d4a94 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <quic_maxk@quicinc.com>
Date: Wed, 25 Sep 2024 17:48:48 -0700
Subject: [PATCH] profiler: add support for different outputs

---
 ggml/src/ggml-impl.h      |  2 +-
 ggml/src/ggml-profile.cpp | 89 ++++++++++++++++++++++++++-------------
 ggml/src/ggml-profile.h   | 45 ++++++++++++++------
 ggml/src/ggml.c           | 17 +++++---
 4 files changed, 105 insertions(+), 48 deletions(-)
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index ee13e4dbc..97ea40667 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -191,7 +191,7 @@ struct ggml_cgraph {
     struct ggml_tensor ** grads;
     struct ggml_tensor ** leafs;
 
-    struct ggml_profile_data ** prof;
+    struct ggml_profile_data * prof;
 
     struct ggml_hash_set visited_hash_set;
 
diff --git a/ggml/src/ggml-profile.cpp b/ggml/src/ggml-profile.cpp
index 0f59455a5..7fee0cece 100644
--- a/ggml/src/ggml-profile.cpp
+++ b/ggml/src/ggml-profile.cpp
@@ -2,42 +2,63 @@
 
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdio.h>
 
+#include <string>
 #include <chrono>
 
 #ifdef GGML_GRAPH_PROFILER
 
-extern "C" void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
+struct ggml_profile_output {
+    const char * prefix;
+    FILE *       stream;
+};
+
+extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
 {
-    if (!getenv("GGML_GRAPH_PROFILE")) { return; }
+    // TODO: make this a param
+    const char *env = getenv("GGML_GRAPH_PROFILE");
+    if (!env) { return; }
 
     // The number of threads may change between passes (pp vs tg).
     // Allocate for max_n_threads for simplicity for now.
     // TODO: use aligned allocator
 
-    size_t node_size = sizeof(struct ggml_profile_data) * GGML_MAX_N_THREADS;
+    size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS;
     size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
-    size_t data_size = node_size * cg->n_nodes;
-    size_t t_size    = pvec_size + data_size;
+    size_t time_size = node_size * cg->n_nodes;
+    size_t t_size    = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data);
 
-    cg->prof = (struct ggml_profile_data **) malloc(t_size);
-    if (!cg->prof) {
+    uint8_t * ptr = (uint8_t *) malloc(t_size);
+    if (!ptr) {
         fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
         return;
     }
+    memset(ptr, 0, t_size);
 
-    memset(cg->prof, 0, t_size);
-
-    // init pre-thread pointers
-    uint8_t * data = (uint8_t *) cg->prof + pvec_size;
+    // init all pointers
+    cg->prof         = (ggml_profile_data *)    ptr; ptr += sizeof(ggml_profile_data);
+    cg->prof->output = (ggml_profile_output *)  ptr; ptr += sizeof(ggml_profile_output);
+    cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size;
     for (int i=0; i < cg->n_nodes; i++) {
-        cg->prof[i] = (struct ggml_profile_data *) data; data += node_size;
+        cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size;
     }
+
+    // init the output
+    ggml_profile_output *out = cg->prof->output;
+    if (!strcmp("stderr", env) || !strcmp("1", env)) {
+        out->prefix = "ggml-profile:";
+        out->stream = stderr;
+    } else {
+        out->prefix = "";
+        out->stream = fopen(env, "w");
+    }
+
 }
 
-extern "C" void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads)
+extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
 {
-    if (!cg->prof) { ggml_profile_graph_init(cg, n_threads); }
+    if (!cg->prof) { ggml_graph_profile_init(cg, n_threads); }
     if (!cg->prof) { return; }
 }
 
@@ -89,13 +110,14 @@ static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t
     p += sprintf(p, "%3s", ggml_type_name(t->type));
 }
 
-
-extern "C" void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
+extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
 {
     if (!cg->prof) { return; }
 
-    fprintf(stderr, "ggml-profile: | node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n");
-    fprintf(stderr, "ggml-profile: | -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n");
+    ggml_profile_output *out = cg->prof->output;
+
+    fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix);
+    fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix);
 
     char dims[64 * GGML_MAX_SRC];
     char types[16 * GGML_MAX_SRC];
@@ -107,39 +129,48 @@ extern "C" void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
 
         // add up per thread counters and reset them
         for (int t=0; t < n_threads; t++) {
-            p_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_SYNC] - cg->prof[i][t].nsec[GGML_PROF_OP_START];
-            s_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END]  - cg->prof[i][t].nsec[GGML_PROF_OP_SYNC];
-            t_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END]  - cg->prof[i][t].nsec[GGML_PROF_OP_START];
+            ggml_profile_timing &timing = cg->prof->timing[i][t];
 
-            cg->prof[i][t].nsec[GGML_PROF_OP_START] = 0;
-            cg->prof[i][t].nsec[GGML_PROF_OP_SYNC]  = 0;
-            cg->prof[i][t].nsec[GGML_PROF_OP_END]   = 0;
+            p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START];
+            s_nsec += timing.nsec[GGML_PROF_OP_END]  - timing.nsec[GGML_PROF_OP_SYNC];
+            t_nsec += timing.nsec[GGML_PROF_OP_END]  - timing.nsec[GGML_PROF_OP_START];
+
+            timing.nsec[GGML_PROF_OP_START] = 0;
+            timing.nsec[GGML_PROF_OP_SYNC]  = 0;
+            timing.nsec[GGML_PROF_OP_END]   = 0;
         }
 
         ggml_profile_format_op_dims(dims, cg->nodes[i]);
         ggml_profile_format_op_types(types, cg->nodes[i]);
 
-        fprintf(stderr, "ggml-profile: | %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n",
+        fprintf(out->stream, "%s| %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n", out->prefix,
             i, ggml_op_name(cg->nodes[i]->op),
             (unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
             dims, types, cg->nodes[i]->name);
     }
-    fprintf(stderr, "ggml-profile:   \n"); // empty line to split tables
+    fprintf(out->stream, "%s   \n", out->prefix); // empty line to split tables
 }
 
-extern "C" void ggml_profile_graph_free(struct ggml_cgraph *cg)
+extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg)
 {
     if (!cg->prof) { return; }
 
+    ggml_profile_output *out = cg->prof->output;
+    if (out->stream != stderr) {
+        fclose(out->stream);
+    }
+
     free(cg->prof); cg->prof = nullptr;
 }
 
-extern "C" void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
+extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
 {
     if (!cg->prof) { return; }
 
     using clock = std::chrono::high_resolution_clock;
-    cg->prof[node_n][ith].nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
+
+    ggml_profile_timing &timing = cg->prof->timing[node_n][ith];
+    timing.nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
 }
 
 #endif // GGML_GRAPH_PROFILER
diff --git a/ggml/src/ggml-profile.h b/ggml/src/ggml-profile.h
index e572b91da..3f8fecc08 100644
--- a/ggml/src/ggml-profile.h
+++ b/ggml/src/ggml-profile.h
@@ -8,45 +8,66 @@
 extern "C" {
 #endif
 
-// op profile data (per op / per thread)
+// op profile events & timing (per op / per thread)
 enum ggml_profile_event {
     GGML_PROF_OP_START,
     GGML_PROF_OP_SYNC,
     GGML_PROF_OP_END
 };
 
-struct ggml_profile_data {
+struct ggml_profile_timing {
     uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
 };
 
+struct ggml_profile_output;
+
+struct ggml_profile_data {
+    struct ggml_profile_output *output;
+    struct ggml_profile_timing ** timing; // per op / per thread timing
+};
+
+// check if profiling is enabled for this graph
+static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg)
+{
+    return cg->prof != NULL;
+}
+
+// get pointer to the timing data for specific node / thread
+// can be used by the backends to populate data collected internally
+static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith)
+{
+    if (!cg->prof) { return NULL; }
+    return &cg->prof->timing[node_n][ith];
+}
+
 #ifndef GGML_GRAPH_PROFILER
 
 // Stub out all profiler functions
 
-static inline void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
+static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
 {
     GGML_UNUSED(cg);
     GGML_UNUSED(n_threads);
 }
 
-static inline void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads)
+static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
 {
     GGML_UNUSED(cg);
     GGML_UNUSED(n_threads);
 }
 
-static inline void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
+static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
 {
     GGML_UNUSED(cg);
     GGML_UNUSED(n_threads);
 }
 
-static inline void ggml_profile_graph_free(struct ggml_cgraph *cg)
+static inline void ggml_graph_profile_free(struct ggml_cgraph *cg)
 {
     GGML_UNUSED(cg);
 }
 
-static inline void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
+static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
 {
     GGML_UNUSED(cg);
     GGML_UNUSED(e);
@@ -56,11 +77,11 @@ static inline void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml
 
 #else
 
-void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads);
-void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads);
-void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads);
-void ggml_profile_graph_free(struct ggml_cgraph *cg);
-void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
+void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
+void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
+void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
+void ggml_graph_profile_free(struct ggml_cgraph *cg);
+void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
 
 #endif // GGML_GRAPH_PROFILER
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 50f861ef3..f7f2d97aa 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -19876,7 +19876,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
-        ggml_profile_op_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
+        ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
 
         ggml_compute_forward(&params, node);
 
@@ -19886,11 +19886,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             tp->ec    = GGML_STATUS_ABORTED;
         }
 
-        ggml_profile_op_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
+        ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
 
         ggml_barrier(state->threadpool);
 
-        ggml_profile_op_event(cgraph, GGML_PROF_OP_END,  node_n, state->ith);
+        ggml_graph_profile_event(cgraph, GGML_PROF_OP_END,  node_n, state->ith);
+    }
+
+    if (ggml_graph_profile_enabled(cgraph)) {
+        // need another barrier to flush the last timing update
+        ggml_barrier(state->threadpool);
     }
 
     return 0;
@@ -20163,7 +20168,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         threadpool->ec               = GGML_STATUS_SUCCESS;
     }
 
-    ggml_profile_graph_start(cgraph, n_threads);
+    ggml_graph_profile_start(cgraph, n_threads);
 
 #ifdef GGML_USE_OPENMP
     if (n_threads > 1) {
@@ -20195,6 +20200,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     ggml_graph_compute_thread(&threadpool->workers[0]);
 #endif
 
+    ggml_graph_profile_finish(cgraph, n_threads);
+
     // don't leave affinity set on the main thread
     clear_numa_thread_affinity();
 
@@ -20204,8 +20211,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         ggml_threadpool_free(threadpool);
     }
 
-    ggml_profile_graph_finish(cgraph, n_threads);
-
     return ret;
 }