profiler: initial support for profiling graph ops

2025-01-09 18:21:45 +00:00 · 2024-09-25 14:25:13 -07:00 · 2024-09-25 14:25:13 -07:00 · 10908a57f7
commit 10908a57f7
parent a89f75e1b7
4 changed files with 172 additions and 0 deletions
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -1330,6 +1330,7 @@ add_library(ggml
            ggml-backend.cpp
            ggml-quants.c
            ggml-quants.h
+            ggml-profile.cpp
            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
            ${GGML_SOURCES_RPC}       ${GGML_HEADERS_RPC}
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@ -172,6 +172,17 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
    GGML_ABORT("fatal error");
 }

+// op profile data (per op / per thread)
+enum ggml_profile_event {
+    GGML_PROF_OP_START,
+    GGML_PROF_OP_SYNC,
+    GGML_PROF_OP_END
+};
+
+struct ggml_profile_data {
+    uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
+};
+
 // computation graph

 enum ggml_cgraph_eval_order {
@ -189,6 +200,8 @@ struct ggml_cgraph {
    struct ggml_tensor ** grads;
    struct ggml_tensor ** leafs;

+    struct ggml_profile_data ** prof;
+
    struct ggml_hash_set visited_hash_set;

    enum ggml_cgraph_eval_order order;
@ -196,6 +209,12 @@ struct ggml_cgraph {

 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

+void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads);
+void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads);
+void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads);
+void ggml_profile_graph_free(struct ggml_cgraph *cg);
+void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml-profile.cpp
+++ b/ggml/src/ggml-profile.cpp
@ -0,0 +1,140 @@
+#include "ggml-impl.h"
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <chrono>
+
+extern "C" void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
+{
+    if (!getenv("GGML_GRAPH_PROFILE")) { return; }
+
+    // The number of threads may change between passes (pp vs tg).
+    // Allocate for max_n_threads for simplicity for now.
+    // TODO: use aligned allocator
+
+    size_t node_size = sizeof(struct ggml_profile_data) * GGML_MAX_N_THREADS;
+    size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
+    size_t data_size = node_size * cg->n_nodes;
+    size_t t_size    = pvec_size + data_size;
+
+    cg->prof = (struct ggml_profile_data **) malloc(t_size);
+    if (!cg->prof) {
+        fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
+        return;
+    }
+
+    memset(cg->prof, 0, t_size);
+
+    // init pre-thread pointers
+    uint8_t * data = (uint8_t *) cg->prof + pvec_size;
+    for (int i=0; i < cg->n_nodes; i++) {
+        cg->prof[i] = (struct ggml_profile_data *) data; data += node_size;
+    }
+}
+
+extern "C" void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads)
+{
+    if (!cg->prof) { ggml_profile_graph_init(cg, n_threads); }
+    if (!cg->prof) { return; }
+}
+
+static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t)
+{
+    return sprintf(str, "%d:%d:%d:%d",
+        (int) t->ne[0], (int) t->ne[1], (int) t->ne[3], (int) t->ne[3]);
+}
+
+static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t)
+{
+    char *p = str;
+
+    // append src0 and src1 (if any)
+    if (t->src[0]) {
+       p += ggml_profile_format_tensor_dims(p, t->src[0]);
+
+       for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+           p += sprintf(p, " x ");
+           p += ggml_profile_format_tensor_dims(p, t->src[i]);
+       }
+
+       p += sprintf(p, " -> ");
+    }
+
+    // format self dims separately for better visual alignment
+    char self[64];
+    ggml_profile_format_tensor_dims(self, t);
+
+    p += sprintf(p, "%12s", self);
+}
+
+static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t)
+{
+    char *p = str;
+
+    // append src0 and src1 (if any)
+    if (t->src[0]) {
+       p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
+
+       for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+           p += sprintf(p, " x ");
+           p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
+       }
+
+       p += sprintf(p, " -> ");
+    }
+
+    p += sprintf(p, "%3s", ggml_type_name(t->type));
+}
+
+
+extern "C" void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
+{
+    if (!cg->prof) { return; }
+
+    fprintf(stderr, "ggml-profile: | node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n");
+    fprintf(stderr, "ggml-profile: | -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n");
+
+    char dims[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+
+    for (int i = 0; i < cg->n_nodes; i++) {
+        uint64_t p_nsec = 0;
+        uint64_t s_nsec = 0;
+        uint64_t t_nsec = 0;
+
+        // add up per thread counters and reset them
+        for (int t=0; t < n_threads; t++) {
+            p_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_SYNC] - cg->prof[i][t].nsec[GGML_PROF_OP_START];
+            s_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END]  - cg->prof[i][t].nsec[GGML_PROF_OP_SYNC];
+            t_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END]  - cg->prof[i][t].nsec[GGML_PROF_OP_START];
+
+            cg->prof[i][t].nsec[GGML_PROF_OP_START] = 0;
+            cg->prof[i][t].nsec[GGML_PROF_OP_SYNC]  = 0;
+            cg->prof[i][t].nsec[GGML_PROF_OP_END]   = 0;
+        }
+
+        ggml_profile_format_op_dims(dims, cg->nodes[i]);
+        ggml_profile_format_op_types(types, cg->nodes[i]);
+
+        fprintf(stderr, "ggml-profile: | %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n",
+            i, ggml_op_name(cg->nodes[i]->op),
+            (unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
+            dims, types, cg->nodes[i]->name);
+    }
+    fprintf(stderr, "ggml-profile:   \n"); // empty line to split tables
+}
+
+extern "C" void ggml_profile_graph_free(struct ggml_cgraph *cg)
+{
+    if (!cg->prof) { return; }
+
+    free(cg->prof); cg->prof = nullptr;
+}
+
+extern "C" void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
+{
+    if (!cg->prof) { return; }
+
+    using clock = std::chrono::high_resolution_clock;
+    cg->prof[node_n][ith].nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
+}
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -18988,6 +18988,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
        /*.nodes        =*/ nodes_ptr,
        /*.grads        =*/ grads_ptr,
        /*.leafs        =*/ leafs_ptr,
+        /*.prof         =*/ NULL,
        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
    };
@ -19009,6 +19010,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
        /*.nodes        =*/ cgraph0->nodes + i0,
        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
        /*.leafs        =*/ NULL,
+        /*.prof         =*/ NULL,
        /*.hash_table   =*/ { 0, NULL, NULL },
        /*.order        =*/ cgraph0->order,
    };
@ -19873,6 +19875,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
        struct ggml_tensor * node = cgraph->nodes[node_n];

+        ggml_profile_op_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
+
        ggml_compute_forward(&params, node);

        if (state->ith == 0 && cplan->abort_callback &&
@ -19881,7 +19885,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            tp->ec    = GGML_STATUS_ABORTED;
        }

+        ggml_profile_op_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
+
        ggml_barrier(state->threadpool);
+
+        ggml_profile_op_event(cgraph, GGML_PROF_OP_END,  node_n, state->ith);
    }

    return 0;
@ -20154,6 +20162,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
        threadpool->ec               = GGML_STATUS_SUCCESS;
    }

+    ggml_profile_graph_start(cgraph, n_threads);
+
 #ifdef GGML_USE_OPENMP
    if (n_threads > 1) {
        #pragma omp parallel num_threads(n_threads)
@ -20193,6 +20203,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
        ggml_threadpool_free(threadpool);
    }

+    ggml_profile_graph_finish(cgraph, n_threads);
+
    return ret;
 }