diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 676f85a36..566b13e00 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1330,6 +1330,7 @@ add_library(ggml
             ggml-backend.cpp
             ggml-quants.c
             ggml-quants.h
+            ggml-profile.cpp
             ${GGML_SOURCES_CUDA}  ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_RPC}   ${GGML_HEADERS_RPC}
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index d3f4bad8c..1760fa7f8 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -172,6 +172,17 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
     GGML_ABORT("fatal error");
 }
 
+// op profile data (per op / per thread)
+enum ggml_profile_event {
+    GGML_PROF_OP_START,
+    GGML_PROF_OP_SYNC,
+    GGML_PROF_OP_END
+};
+
+struct ggml_profile_data {
+    uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
+};
+
 // computation graph
 
 enum ggml_cgraph_eval_order {
@@ -189,6 +200,8 @@ struct ggml_cgraph {
     struct ggml_tensor ** grads;
     struct ggml_tensor ** leafs;
 
+    struct ggml_profile_data ** prof;
+
     struct ggml_hash_set visited_hash_set;
 
     enum ggml_cgraph_eval_order order;
@@ -196,6 +209,12 @@
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads);
+void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads);
+void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads);
+void ggml_profile_graph_free(struct ggml_cgraph *cg);
+void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-profile.cpp b/ggml/src/ggml-profile.cpp
new file mode 100644
index 000000000..e6ee6aea0
--- /dev/null
+++ b/ggml/src/ggml-profile.cpp
@@ -0,0 +1,143 @@
+#include "ggml-impl.h"
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <chrono>
+
+extern "C" void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
+{
+    if (!getenv("GGML_GRAPH_PROFILE")) { return; }
+
+    // The number of threads may change between passes (pp vs tg).
+    // Allocate for GGML_MAX_N_THREADS for simplicity for now.
+    // TODO: use aligned allocator
+
+    size_t node_size = sizeof(struct ggml_profile_data) * GGML_MAX_N_THREADS;
+    size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
+    size_t data_size = node_size * cg->n_nodes;
+    size_t t_size    = pvec_size + data_size;
+
+    cg->prof = (struct ggml_profile_data **) malloc(t_size);
+    if (!cg->prof) {
+        fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
+        return;
+    }
+
+    memset(cg->prof, 0, t_size);
+
+    // init per-thread pointers
+    uint8_t * data = (uint8_t *) cg->prof + pvec_size;
+    for (int i=0; i < cg->n_nodes; i++) {
+        cg->prof[i] = (struct ggml_profile_data *) data; data += node_size;
+    }
+}
+
+extern "C" void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads)
+{
+    if (!cg->prof) { ggml_profile_graph_init(cg, n_threads); }
+    if (!cg->prof) { return; }
+}
+
+static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t)
+{
+    return sprintf(str, "%d:%d:%d:%d",
+        (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
+}
+
+static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t)
+{
+    char *p = str;
+
+    // append src dims (if any)
+    if (t->src[0]) {
+        p += ggml_profile_format_tensor_dims(p, t->src[0]);
+
+        for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+            p += sprintf(p, " x ");
+            p += ggml_profile_format_tensor_dims(p, t->src[i]);
+        }
+
+        p += sprintf(p, " -> ");
+    }
+
+    // format self dims separately for better visual alignment
+    char self[64];
+    ggml_profile_format_tensor_dims(self, t);
+
+    p += sprintf(p, "%12s", self);
+}
+
+static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t)
+{
+    char *p = str;
+
+    // append src types (if any)
+    if (t->src[0]) {
+        p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
+
+        for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+            p += sprintf(p, " x ");
+            p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
+        }
+
+        p += sprintf(p, " -> ");
+    }
+
+    p += sprintf(p, "%3s", ggml_type_name(t->type));
+}
+
+
+extern "C" void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
+{
+    if (!cg->prof) { return; }
+
+    fprintf(stderr, "ggml-profile: | node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n");
+    fprintf(stderr, "ggml-profile: | -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n");
+
+    char dims[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+
+    for (int i = 0; i < cg->n_nodes; i++) {
+        uint64_t p_nsec = 0;
+        uint64_t s_nsec = 0;
+        uint64_t t_nsec = 0;
+
+        // add up per-thread counters and reset them
+        for (int t=0; t < n_threads; t++) {
+            p_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_SYNC] - cg->prof[i][t].nsec[GGML_PROF_OP_START];
+            s_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END]  - cg->prof[i][t].nsec[GGML_PROF_OP_SYNC];
+            t_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END]  - cg->prof[i][t].nsec[GGML_PROF_OP_START];
+
+            cg->prof[i][t].nsec[GGML_PROF_OP_START] = 0;
+            cg->prof[i][t].nsec[GGML_PROF_OP_SYNC]  = 0;
+            cg->prof[i][t].nsec[GGML_PROF_OP_END]   = 0;
+        }
+
+        ggml_profile_format_op_dims(dims, cg->nodes[i]);
+        ggml_profile_format_op_types(types, cg->nodes[i]);
+
+        fprintf(stderr, "ggml-profile: | %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n",
+            i, ggml_op_name(cg->nodes[i]->op),
+            (unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
+            dims, types, cg->nodes[i]->name);
+    }
+    fprintf(stderr,
+        "ggml-profile: \n"); // empty line to split tables
+}
+
+extern "C" void ggml_profile_graph_free(struct ggml_cgraph *cg)
+{
+    if (!cg->prof) { return; }
+
+    free(cg->prof); cg->prof = nullptr;
+}
+
+extern "C" void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
+{
+    if (!cg->prof) { return; }
+
+    using clock = std::chrono::high_resolution_clock;
+    cg->prof[node_n][ith].nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
+}
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 3f01092d9..211c11448 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18988,6 +18988,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.nodes        =*/ nodes_ptr,
         /*.grads        =*/ grads_ptr,
         /*.leafs        =*/ leafs_ptr,
+        /*.prof         =*/ NULL,
         /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
@@ -19009,6 +19010,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.nodes        =*/ cgraph0->nodes + i0,
         /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
         /*.leafs        =*/ NULL,
+        /*.prof         =*/ NULL,
         /*.hash_table   =*/ { 0, NULL, NULL },
         /*.order        =*/ cgraph0->order,
     };
@@ -19873,6 +19875,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
+        ggml_profile_op_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
+
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback &&
@@ -19881,7 +19885,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             tp->ec    = GGML_STATUS_ABORTED;
         }
 
+        ggml_profile_op_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
+
         ggml_barrier(state->threadpool);
+
+        ggml_profile_op_event(cgraph, GGML_PROF_OP_END, node_n, state->ith);
     }
 
     return 0;
@@ -20154,6 +20162,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         threadpool->ec = GGML_STATUS_SUCCESS;
     }
 
+    ggml_profile_graph_start(cgraph, n_threads);
+
 #ifdef GGML_USE_OPENMP
     if (n_threads > 1) {
         #pragma omp parallel num_threads(n_threads)
@@ -20193,6 +20203,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
 
         ggml_threadpool_free(threadpool);
     }
 
+    ggml_profile_graph_finish(cgraph, n_threads);
+
     return ret;
 }