From b1ca8f36a9cdbcee5f5c425df717611a1040a897 Mon Sep 17 00:00:00 2001 From: Qingyou Meng Date: Sat, 1 Jul 2023 23:42:43 +0800 Subject: [PATCH] ggml : disable GGML_TASK_INIT and GGML_TASK_FINALIZE by default (#1995) Will not be scheduled unless explicitly enabled. --- ggml.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--------- ggml.h | 3 +++ 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/ggml.c b/ggml.c index 684caaa37..75cc44baa 100644 --- a/ggml.c +++ b/ggml.c @@ -3846,6 +3846,40 @@ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); +// WARN: +// Mis-confguration can lead to problem that's hard to reason about: +// * At best it crash or talks nosense. +// * At worst it talks slightly difference but hard to perceive. +// +// An op has to enable INIT or FINALIZE when any of it's branch needs that pass. +// Take care about compile options (e.g., GGML_USE_xxx). +static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * I = GGML_OP_HAS_INIT; + + I[GGML_OP_ACC ] = true; + I[GGML_OP_MUL_MAT ] = true; + I[GGML_OP_OUT_PROD ] = true; + I[GGML_OP_SET ] = true; + I[GGML_OP_GET_ROWS_BACK ] = true; + I[GGML_OP_DIAG_MASK_INF ] = true; + I[GGML_OP_DIAG_MASK_ZERO ] = true; + I[GGML_OP_CONV_1D_S1_PH ] = true; + I[GGML_OP_CONV_1D_S2_PH ] = true; + I[GGML_OP_CONV_2D_SK_P0 ] = true; + I[GGML_OP_FLASH_ATTN_BACK ] = true; + I[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } + + { // FINALIZE + bool * F = GGML_OP_HAS_FINALIZE; + + F[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + // // ggml context // @@ -4267,6 +4301,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif + ggml_setup_op_has_task_pass(); + is_first_call = false; } @@ -16791,9 +16827,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (node_n != -1) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; - params.nth = node->n_tasks; - ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = node->n_tasks; + ggml_compute_forward(¶ms, node); + ggml_graph_compute_perf_stats_node(node, state->shared); + } } // distribute new work or execute it direct if 1T @@ -16805,10 +16843,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); + params.nth = node->n_tasks; + /* INIT */ - params.type = GGML_TASK_INIT; - params.nth = node->n_tasks; - ggml_compute_forward(¶ms, node); + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(¶ms, node); + } if (node->n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, @@ -16816,9 +16857,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { params.type = GGML_TASK_COMPUTE; ggml_compute_forward(¶ms, node); - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); + ggml_graph_compute_perf_stats_node(node, state->shared); + } } else { break; } diff --git a/ggml.h b/ggml.h index 459913222..11b51f8bd 100644 --- a/ggml.h +++ b/ggml.h @@ -444,6 +444,9 @@ extern "C" { // compute types + + // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. + // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. enum ggml_task_type { GGML_TASK_INIT = 0, GGML_TASK_COMPUTE,