diff --git a/CMakeLists.txt b/CMakeLists.txt index 832c1e986..716673da2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" OFF) -option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF) +option(LLAMA_TUNE "llama: mulmat tune" ON) # # Build info header @@ -278,9 +278,9 @@ if (LLAMA_METAL) ) endif() -if (LLAMA_MULMAT_TUNE) - add_compile_definitions(GGML_USE_MULMAT_TUNE) - add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG) +if (LLAMA_TUNE) + add_compile_definitions(GGML_USE_TUNE) + add_compile_definitions(GGML_TUNE_NDEBUG) endif() if (LLAMA_K_QUANTS) diff --git a/Makefile b/Makefile index a8d1bdc09..531f62fb0 100644 --- a/Makefile +++ b/Makefile @@ -231,14 +231,14 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif -ifdef LLAMA_NO_K_QUANTS +ifndef LLAMA_NO_K_QUANTS k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS -ifdef LLAMA_MULMAT_TUNE - CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG - CXXFLAGS += -DGGML_USE_MULMAT_TUNE +ifndef LLAMA_NO_TUNE +CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG +CXXFLAGS += -DGGML_USE_TUNE endif # diff --git a/examples/common.cpp b/examples/common.cpp index 882e90c9c..fd6df4947 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -345,7 +345,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE } else if (arg == "--tune") { params.tune = true; } else if (arg == "--tune-file") { @@ -354,7 +354,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.tune_file = argv[i]; -#endif // GGML_USE_MULMAT_TUNE +#endif // GGML_USE_TUNE } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -508,7 +508,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n"); fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n"); #endif diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 542e463bf..fa243ce95 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,7 +117,7 @@ int main(int argc, char ** argv) { return 1; } -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE if (params.tune || !params.tune_file.empty()) { bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); if (!ok || (params.tune && !params.tune_file.empty())) { diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md index cff8a3d64..df023757a 100644 --- a/examples/mulmat-tune/README.md +++ b/examples/mulmat-tune/README.md @@ -23,13 +23,13 @@ run bench ahead of time (saving tens of seconds), but there are two shortcomings Makefile: ``` -make clean && LLAMA_MULMAT_TUNE=1 make +make clean && make ``` CMake (with BLAS): ``` cmake --build . 
--target clean
-cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake .. -DLLAMA_BLAS=ON
 cmake --build . --config Release
 ```
 
@@ -52,13 +52,13 @@ Run examples:
 Makefile:
 
 ```
-make clean && LLAMA_MULMAT_TUNE=1 make
+make clean && make
 ```
 
 CMake (with BLAS)
 
 ```
 cmake --build . --target clean
-cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake .. -DLLAMA_BLAS=ON
 cmake --build . --config Release
 ```
 
@@ -103,22 +103,29 @@ setup properly. General steps:
 1. run `./mulmat-tune -h` to see how to build for misc vendors.
-   you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g:
+   To enable debug output, remove `-DGGML_TUNE_NDEBUG` from the Makefile, then run:
+
    ```
-   make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make
+   make clean; make
    ```
+
    On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
    with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU`
-   or `CPU_BLAS` is more faster (as of the estimation from mulmat tune).
+   or `CPU_BLAS` is faster (according to the mulmat tune estimation); try running
+   with `-t 1`.
 2. create a small prompt file:
+
    ```
    head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
    ```
+
 3. run any of the following example commands.
+
    ```
    ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
    ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
    ```
+
 * `--mlock` is recommended for `macOS`, you may not want to use it.
 * don't change `-c 128`: too large `context size` causes 0 perplexity trunk.
 * `-t` is the number of threads, recommend `1`, `2`, `4` or `6`.
diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp
index 62f1da277..ab3334d76 100644
--- a/examples/mulmat-tune/mulmat-tune.cpp
+++ b/examples/mulmat-tune/mulmat-tune.cpp
@@ -262,8 +262,6 @@ int main(int argc, char **argv) {
     struct ggml_mulmat_tune_params params;
     memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
 
-    ggml_mulmat_init_task_profiles();
-
     ggml_mulmat_tune_model_init(&params.model, model_name, ftype);
     params.m_num = m_num;
     params.n_pass = n_pass;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 1f14c18de..2cdd9db06 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
     if (params.tune || !params.tune_file.empty()){
         bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
         if (!ok || (params.tune && !params.tune_file.empty())) {
diff --git a/ggml-threading.c b/ggml-threading.c
index cf17793f6..6dd6d2817 100644
--- a/ggml-threading.c
+++ b/ggml-threading.c
@@ -394,7 +394,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
                 enum ggml_compute_error err =
                     shared->task_runner(&state->params, state->node);
 
-                GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
+                GGML_ASSERT(err == GGML_COMPUTE_OK);
 
                 ggml_spin_lock(&shared->spin);
 
@@ -433,7 +433,11 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
 
     // This is the params for main thread.
struct ggml_compute_params params; - enum ggml_compute_error err; + enum ggml_compute_error err = GGML_COMPUTE_OK; + +START: + + memset(¶ms, 0, sizeof(struct ggml_compute_params)); for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { @@ -504,11 +508,19 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, } if (err != GGML_COMPUTE_OK) { + if (err == GGML_COMPUTE_FALLBACK) { + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n = ggml_get_task_profiles(node, profiles); + GGML_ASSERT(n > 0); + memcpy(&node->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); + goto START; + } return err; } } - return GGML_COMPUTE_OK; + return err; } struct ggml_threading_context * diff --git a/ggml-tune.c b/ggml-tune.c index fbca953ed..52ca96bf3 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -55,7 +55,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; - struct ggml_task_profile *prof = NULL; + const struct ggml_task_profile *prof = NULL; if (e->M == M && e->N == N && e->K == K) { prof = e->profile; @@ -97,10 +97,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( e->N = N; e->K = K; - // to disable this, build with - // `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 - // make` -#if !defined(GGML_MULMAT_TUNE_NDEBUG) +#ifndef GGML_TUNE_NDEBUG const char *names[3]; for (int i = 0; i < 3; i++) { names[i] = ggml_mulmat_tune_task_backend_name( @@ -163,8 +160,8 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params, - struct ggml_task_profile_factory *pf) { - + ggml_task_profiles_provider *profiles_provider) { + GGML_ASSERT(profiles_provider); struct ggml_mulmat_tune_model *model = ¶ms->model; memset(tune, 0, sizeof(struct ggml_mulmat_tune)); @@ -208,8 +205,20 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, for (int i = 0; i < tune->n_shapes; i++) { struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; - shape->n_profiles = ggml_mulmat_get_task_profiles( - pf, shape->src0_type, shape->src1_type, &shape->profiles); + + struct ggml_tensor src0 = { + .type = shape->src0_type, + }; + struct ggml_tensor src1 = { + .type = shape->src1_type, + }; + struct ggml_tensor node = { + .op = GGML_OP_MUL_MAT, + .src0 = &src0, + .src1 = &src1, + }; + + shape->n_profiles = profiles_provider(&node, shape->profiles); if (shape->n_profiles == 0) { // allowed for testing. 
continue; @@ -304,9 +313,20 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, for (int i = 0; i < tune->n_shapes; i++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; - struct ggml_task_profile *builtin_profiles = NULL; - int n_profiles = ggml_mulmat_get_task_profiles( - NULL, shape->src0_type, shape->src1_type, &builtin_profiles); + struct ggml_tensor src0 = { + .type = shape->src0_type, + }; + struct ggml_tensor src1 = { + .type = shape->src1_type, + }; + struct ggml_tensor node = { + .op = GGML_OP_MUL_MAT, + .src0 = &src0, + .src1 = &src1, + }; + + struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); @@ -382,13 +402,6 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { memset(shape->items, 0, item_size); } - { - size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles; - shape->profiles = malloc(sz); - GGML_ASSERT(shape->profiles); - memset(shape->profiles, 0, sz); - } - for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; for (int j = 0; j < 3; j++) { @@ -468,7 +481,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, } } - struct ggml_task_profile *profile = &shape->profiles[ip]; + const struct ggml_task_profile *profile = &shape->profiles[ip]; for (int k = 0; k < 3; k++) { if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { rc = fprintf(fp, "%9d", item->stages_time[k]); @@ -537,7 +550,7 @@ void ggml_mulmat_tune_estimate_time( const int max_m = shape->items[m_num - 1].M; for (int ip = 0; ip < shape->n_profiles; ip++) { - struct ggml_task_profile *profile = &shape->profiles[ip]; + const struct ggml_task_profile *profile = &shape->profiles[ip]; profile_time[ip].total_time = 0; profile_time[ip].profile = profile; @@ -573,7 +586,7 @@ void ggml_mulmat_tune_estimate_time( GGML_ASSERT(p0 && p1); for (int i_stage = 0; i_stage < 3; i_stage++) { - struct ggml_task_stage *stage = &profile->stages[i_stage]; + const struct ggml_task_stage *stage = &profile->stages[i_stage]; if (stage->backend == GGML_TASK_BACKEND_NONE) { continue; } @@ -736,7 +749,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, return false; } - bool ok = ggml_mulmat_tune_init(tune, params, NULL); + bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); if (!ok) { return false; } diff --git a/ggml-tune.h b/ggml-tune.h index 404f1f1c4..04b25873c 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -46,7 +46,7 @@ struct ggml_mulmat_tune_shape { enum ggml_type src1_type; int n_profiles; - struct ggml_task_profile *profiles; + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int m_num; int *arr_m; @@ -69,7 +69,7 @@ struct ggml_mulmat_tune { }; struct ggml_mulmat_tune_time { - struct ggml_task_profile *profile; + const struct ggml_task_profile *profile; int stage_time[3]; int total_time; }; @@ -78,7 +78,7 @@ struct mm_cache_element { int M; int N; int K; - struct ggml_task_profile *profile; + const struct ggml_task_profile *profile; int stages_time[3]; }; @@ -108,7 +108,7 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params, - struct ggml_task_profile_factory *profile_factory); + ggml_task_profiles_provider *profiles_provider); void 
ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); diff --git a/ggml.c b/ggml.c index 5d0b83b1d..b75f33b88 100644 --- a/ggml.c +++ b/ggml.c @@ -144,7 +144,7 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif -#if defined(GGML_USE_MULMAT_TUNE) +#if defined(GGML_USE_TUNE) #include "ggml-tune.h" #endif @@ -4043,8 +4043,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif - ggml_mulmat_init_task_profiles(); - is_first_call = false; } @@ -15524,164 +15522,254 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg return result; } -// ---- mulmat task profiles ---- +// ---- task profiles ---- -static struct ggml_task_profile_factory default_task_profile_factory = {0}; - -// TODO: thread unsafe. Should be initialized once. -void ggml_mulmat_init_task_profiles(void) { - const size_t sz = sizeof(struct ggml_task_profile_factory); - memset(&default_task_profile_factory, 0, sz); - - // f32 - { - struct ggml_task_profile *p = default_task_profile_factory.f32_f32; - int i = 0; - - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif - default_task_profile_factory.n_f32_f32 = i; - } - - // f16 - { - struct ggml_task_profile *p = default_task_profile_factory.f16_f32; - int i = 0; - - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif - default_task_profile_factory.n_f16_f32 = i; - } - - // qxx - { - struct ggml_task_profile *p = default_task_profile_factory.qxx_f32; - int i = 0; - - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[0].parallel = true; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif - default_task_profile_factory.n_qxx_f32 = i; - } -} - -int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf, - enum ggml_type src0_t, enum ggml_type src1_t, - struct ggml_task_profile **profiles) { +// Implement `ggml_task_profiles_provider`. +// Fill `profiles` for the `node` and return number of profiles. +// +// NOTE: the node may be incompleted from testing or tunning, so please assert +// everything used here. 
+inline int ggml_get_task_profiles( + struct ggml_tensor *node, + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { + GGML_ASSERT(node); + GGML_ASSERT(node->op >= 0); GGML_ASSERT(profiles); - if (pf == NULL) { - pf = &default_task_profile_factory; + memset(profiles, 0, + sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES); + + struct ggml_task_profile *p = profiles; + int n_profiles = 0; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_ACC: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_RELU: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_MUL: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_GELU: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: { + GGML_ASSERT(node->src0); + GGML_ASSERT(node->src1); + + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + + GGML_ASSERT(src1_t == GGML_TYPE_F32); + + int i = 0; + if (src0_t == GGML_TYPE_F32) { + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif + +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + } else if (src0_t == GGML_TYPE_F16) { + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif + +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + } else if (ggml_is_quantized(src0_t)) { + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[0].parallel = true; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif + +#if defined(GGML_USE_CUBLAS) 
+ p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + } + n_profiles = i; + } break; + case GGML_OP_SCALE: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_SET: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_ALIBI: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CLAMP: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CONV_1D_1S: + case GGML_OP_CONV_1D_2S: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_FLASH_ATTN: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_FLASH_FF: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; } - - GGML_ASSERT(src1_t == GGML_TYPE_F32); - - if (src0_t == GGML_TYPE_F32) { - *profiles = pf->f32_f32; - return pf->n_f32_f32; - } - - if (src0_t == GGML_TYPE_F16) { - *profiles = pf->f16_f32; - return pf->n_f16_f32; - } - - if (ggml_is_quantized(src0_t)) { - *profiles = pf->qxx_f32; - return pf->n_qxx_f32; - } - - GGML_ASSERT(false); -} - -static const struct ggml_task_profile * -ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf, - enum ggml_type src0_type, - enum ggml_type src1_type) { - GGML_ASSERT(src1_type == GGML_TYPE_F32); - if (pf == NULL) { - pf = &default_task_profile_factory; - } - - struct ggml_task_profile *p = NULL; - - if (src0_type == GGML_TYPE_F32) { - p = &pf->f32_f32[0]; - } else if (src0_type == GGML_TYPE_F16) { - p = &pf->f16_f32[0]; - } else if (ggml_is_quantized(src0_type)) { - p = &pf->qxx_f32[0]; - } else { + case GGML_OP_FLASH_ATTN_BACK: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + p[0].stages[2].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_NONE: + case GGML_OP_COUNT: { + GGML_ASSERT(false); + } break; + default: GGML_ASSERT(false); } - for (int i = 0; i < 3; i++) { - 
GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU || - p->stages[i].backend == GGML_TASK_BACKEND_NONE); - } - - return p; + GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); + return n_profiles; } // Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. -static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, - struct ggml_mulmat_tune *tune) { +static const struct ggml_task_profile *ggml_mulmat_get_task_profile( + struct ggml_tensor *node, struct ggml_task_profile *profiles, + int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) { + GGML_ASSERT(node); GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); + GGML_ASSERT(profiles); + GGML_ASSERT(n_profiles >= 2); enum ggml_type src0_t = node->src0->type; enum ggml_type src1_t = node->src1->type; @@ -15697,42 +15785,26 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, int N = (int)node->ne[0]; int K = (int)node->src1->ne[0]; - struct ggml_task_profile *profiles = NULL; - int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles); - GGML_ASSERT(n_profiles >= 2); - GGML_ASSERT(profiles); - const struct ggml_task_profile *prof = NULL; if (cond_match) { -#if defined(GGML_USE_MULMAT_TUNE) +#if defined(GGML_USE_TUNE) if (tune != NULL) { - int stages_time_us[3]; - prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); + prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + src1_t, stages_time_us); if (prof != NULL) { - GGML_ASSERT(prof); - memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); - // Do not wait if the estimated execution time is too small (e.g. less than 0.1 ms) - // TODO: need bench actual wait/notify time, see ggml-threading.c - for (int i = 0; i < 3; i++) { - if (node->task_profile.stages[i].wait) { - if (stages_time_us[i] < 100) { - node->task_profile.stages[i].wait = false; - } - } - } - return; + return prof; } } #else UNUSED(tune); + UNUSED(stages_time_us); #endif if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { for (int j = 0; j < n_profiles; j++) { enum ggml_task_backend comp_be = profiles[j].stages[GGML_TASK_COMPUTE].backend; - switch (comp_be) { case GGML_TASK_BACKEND_GPU_CUDA: { GGML_ASSERT(ggml_cpu_has_cublas()); @@ -15753,76 +15825,131 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, break; } } + + if (prof) { + break; + } } } } if (prof == NULL) { - prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t); + prof = &profiles[0]; + GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU); } - GGML_ASSERT(prof); - memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); + return prof; } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; - if (ggml_cpu_has_blas()) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor *node = cgraph->nodes[i]; - - memset(&node->task_profile, 0, sizeof(struct ggml_task_profile)); - struct ggml_task_stage *stages = node->task_profile.stages; - - // Adapt node->backend: assume GPU at COMPUTE stage. 
- if (node->backend > GGML_BACKEND_CPU) { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE; - stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE; - - stages[GGML_TASK_COMPUTE].parallel = false; - bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL); - stages[GGML_TASK_COMPUTE].wait = wait; - if (ggml_cpu_has_cublas()) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); - } - } else if (node->op == GGML_OP_MUL_MAT) { - struct ggml_mulmat_tune * tune = NULL; -#if defined(GGML_USE_MULMAT_TUNE) - tune = cgraph->tune; -#endif - ggml_mulmat_set_tensor_task_profile(node, tune); - } else if (node->op == GGML_OP_OUT_PROD) { - ggml_mulmat_set_tensor_task_profile(node, NULL); - } - } - } - struct ggml_threading_context *thrd_ctx = ggml_threading_start( n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); // initialize tasks + work buffer { + // int64_t t0 = ggml_time_us(); + size_t work_size = 0; + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) { + continue; + } + + int n_profiles = ggml_get_task_profiles(node, profiles); + + const struct ggml_task_profile *profile = NULL; + + // Adapt node->backend: assume GPU at COMPUTE stage. + if (node->backend == GGML_BACKEND_GPU || + node->backend == GGML_BACKEND_GPU_SPLIT) { + enum ggml_task_backend be; + if (ggml_cpu_has_cublas()) { + be = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + be = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + + for (int j = 0; j < n_profiles; j++) { + if (profiles[j].stages[1].backend == be) { + profile = &profiles[j]; + break; + } + } + GGML_ASSERT(profile); + } else { + GGML_ASSERT(node->backend == GGML_BACKEND_CPU); + } + + bool profile_copied = false; + + if (node->op == GGML_OP_MUL_MAT) { +#if defined(GGML_USE_TUNE) + int stages_time_us[3]; + profile = ggml_mulmat_get_task_profile( + node, profiles, n_profiles, cgraph->tune, stages_time_us); + GGML_ASSERT(profile); + + if (cgraph->tune) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + profile_copied = true; + + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j< 3; j++) { + if (node->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + node->task_profile.stages[j].wait = false; + } + } + } + } +#else + profile = ggml_mulmat_get_task_profile(node, profiles, + n_profiles, NULL, NULL); + GGML_ASSERT(profile); +#endif + } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right? + profile = ggml_mulmat_get_task_profile(node, profiles, + n_profiles, NULL, NULL); + GGML_ASSERT(profile); + } else { + profile = &profiles[0]; + GGML_ASSERT(profile->stages[1].backend == + GGML_TASK_BACKEND_CPU); + } + + if (!profile_copied) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + } + struct ggml_task_stage *stages = node->task_profile.stages; + // compute stage n_tasks. + int n_tasks = stages[1].parallel ? n_threads : 1; + + // Allocate temp buffer `wdata` for CPU. 
+ // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; size_t cur = 0; if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -15830,27 +15957,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ADD: case GGML_OP_ADD1: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; } work_size = MAX(work_size, cur); } break; case GGML_OP_ACC: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -15870,16 +15990,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_NEG: case GGML_OP_STEP: case GGML_OP_RELU: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_MUL: - { - if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - } - } break; case GGML_OP_GELU: case GGML_OP_SILU: case GGML_OP_SILU_BACK: @@ -15887,28 +15998,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: + case GGML_OP_OUT_PROD: // FIXME: is is right? { size_t cur = 0; enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - // TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct. - if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; - bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || - ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); - GGML_ASSERT(cond_match); - } - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); @@ -15930,7 +16027,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_ASSERT(false); } } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - // We have to reseve buffer for CUDA because it may fallback to CPU. 
if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { GGML_ASSERT(ggml_cpu_has_cublas()); } @@ -15955,13 +16051,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_SET: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: @@ -15972,33 +16064,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - } break; case GGML_OP_ALIBI: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_CLAMP: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); GGML_ASSERT(node->src1->ne[3] == 1); @@ -16026,62 +16103,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_ATTN: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case 
GGML_OP_FLASH_ATTN_BACK: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; const int64_t D = node->src0->ne[0]; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -16089,31 +16157,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU; - - size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads); + size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_tasks); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_COUNT: { @@ -16134,6 +16192,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); } + + // ~ 50 us + //printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0)); } const int64_t perf_start_cycles = ggml_perf_cycles(); @@ -16162,16 +16223,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) enum ggml_compute_error err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); - if (err == GGML_COMPUTE_FALLBACK) { - if (node->op == GGML_OP_MUL_MAT) { - const struct ggml_task_profile *p = - ggml_mulmat_get_default_task_profile( - NULL, node->src0->type, node->src1->type); - memcpy(&node->task_profile, p, - sizeof(struct ggml_task_profile)); - } - err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); - } GGML_ASSERT(err == GGML_COMPUTE_OK); // performance stats (node) diff --git a/ggml.h b/ggml.h index f51b658fd..5ab78c4a0 100644 --- a/ggml.h +++ b/ggml.h @@ -202,7 +202,7 @@ #define GGML_MAX_OPT 4 #define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 -#define GGML_MAX_TASK_PROFILES 8 +#define GGML_MAX_TASK_PROFILES 4 
#define GGML_ASSERT(x) \ do { \ @@ -399,17 +399,6 @@ extern "C" { uint8_t dev_flags[4]; }; - struct ggml_task_profile_factory { - struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES]; - int n_f32_f32; - - struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES]; - int n_f16_f32; - - struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES]; - int n_qxx_f32; - }; - // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -450,6 +439,11 @@ extern "C" { char padding[12]; }; + // Fill `profiles` for the `node` and return number of profiles. + typedef int (ggml_task_profiles_provider) ( + struct ggml_tensor *node, + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]); + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); // computation graph @@ -1345,15 +1339,11 @@ extern "C" { GGML_API int ggml_cpu_has_vsx (void); // - // mulmat task profiles + // task profiles // - GGML_API void ggml_mulmat_init_task_profiles(void); - GGML_API int ggml_mulmat_get_task_profiles( - struct ggml_task_profile_factory *pf, - enum ggml_type src0_t, - enum ggml_type src1_t, - struct ggml_task_profile **profiles); + // Implements `ggml_task_profiles_provider`. + GGML_API int ggml_get_task_profiles (struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]); // // Internal types and functions exposed for tests and benchmarks diff --git a/llama.cpp b/llama.cpp index fa5a94e21..acc0e59f7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,7 +21,7 @@ #include "ggml-metal.h" #endif -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE #include "ggml-tune.h" #endif @@ -285,7 +285,7 @@ struct llama_context { int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE struct ggml_mulmat_tune *tune = nullptr; #endif @@ -1408,7 +1408,7 @@ static bool llama_eval_internal( ggml_cgraph gf = {}; gf.n_threads = n_threads; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE gf.tune =lctx.tune; #endif @@ -2743,7 +2743,7 @@ struct llama_context * llama_init_from_file( return ctx; } -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { printf("\n"); if (ctx->model.n_gpu_layers != 0) { @@ -2882,7 +2882,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons #endif void llama_free(struct llama_context * ctx) { -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE if (ctx->tune) { delete(ctx->tune); } diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index ed9d8aa2b..deb15fd84 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -356,6 +356,17 @@ int main(void) { { ++n_tests; + // required by getting task profiles. + node.op = GGML_OP_MUL_MAT; + struct ggml_tensor src0 = { + .type = GGML_TYPE_Q4_0, + }; + struct ggml_tensor src1 = { + .type = GGML_TYPE_F32, + }; + node.src0 = &src0; + node.src1 = &src1; + node.backend = GGML_BACKEND_GPU; if (test_fallback(&node) == 0) { ++n_passed; diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index ed612fff4..e0a6950d9 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -3,6 +3,8 @@ #include +#define UNUSED(x) (void)(x) + static int bench(void); static int estimate_time_non_zero_NK(void); @@ -77,6 +79,20 @@ static int bench(void) { return ok ? 
0 : 1; } +// implement `ggml_task_profiles_provider` +static int +ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, + struct ggml_task_profile *profiles) { + UNUSED(node); + profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + + return 2; +} + int estimate_time_non_zero_NK(void) { printf("test: %s\n", __func__); @@ -92,22 +108,10 @@ int estimate_time_non_zero_NK(void) { const int m_num = 2; - struct ggml_task_profile_factory pf; - memset(&pf, 0, sizeof(struct ggml_task_profile_factory)); - - { - pf.n_qxx_f32 = 2; - pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - - pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU; - pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - } - struct ggml_mulmat_tune_params params; init_params(¶ms, m_num); - ggml_mulmat_tune_init(&tune, ¶ms, &pf); + ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider); struct ggml_mulmat_tune_shape *shape = NULL; for (int i = 0; i < tune.n_shapes; i++) {
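
Reviewer note: this patch replaces the old `ggml_task_profile_factory` with the `ggml_task_profiles_provider` callback declared in `ggml.h`, and `ggml_mulmat_tune_init()` now takes such a provider (see `ggml_mulmat_tune_bench()` passing `ggml_get_task_profiles`). Below is a minimal sketch of a custom provider, modeled on the mock in `tests/test-ggml-tune.c` above; the function name is hypothetical and the stage/backend layout is illustrative only, not part of the patch.

```
// Minimal sketch of a ggml_task_profiles_provider implementation.
// Assumes the typedef and enums introduced by this patch (ggml.h);
// `my_qxx_profiles_provider` is a hypothetical name used for illustration.
#include <string.h>

#include "ggml.h"

static int my_qxx_profiles_provider(
    struct ggml_tensor *node,
    struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) {
    (void)node; // the node may be only partially initialized during tuning

    memset(profiles, 0,
           sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES);

    // profile 0: plain CPU for both INIT and COMPUTE stages
    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
    profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;

    // profile 1: CPU INIT, BLAS COMPUTE
    profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
    profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;

    return 2; // number of profiles filled in
}
```

A tune session can then be initialized with either this provider or the built-in `ggml_get_task_profiles`, e.g. `ggml_mulmat_tune_init(&tune, &params, my_qxx_profiles_provider);`.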