bulk refactored task profile to support complete fallback; enable tune by default for ease of dev

mqy 2023-06-15 06:43:08 +08:00
parent 1b041d7737
commit 48016f685c
15 changed files with 480 additions and 394 deletions

View File

@ -78,7 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF)
option(LLAMA_TUNE "llama: mulmat tune" ON)
#
# Build info header
@ -278,9 +278,9 @@ if (LLAMA_METAL)
)
endif()
if (LLAMA_MULMAT_TUNE)
add_compile_definitions(GGML_USE_MULMAT_TUNE)
add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG)
if (LLAMA_TUNE)
add_compile_definitions(GGML_USE_TUNE)
add_compile_definitions(GGML_TUNE_NDEBUG)
endif()
if (LLAMA_K_QUANTS)

View File

@ -231,14 +231,14 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
ifdef LLAMA_NO_K_QUANTS
ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
ifdef LLAMA_MULMAT_TUNE
CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG
CXXFLAGS += -DGGML_USE_MULMAT_TUNE
ifndef LLAMA_NO_TUNE
CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG
CXXFLAGS += -DGGML_USE_TUNE
endif
#
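With this commit, tune is on by default: the `LLAMA_TUNE` CMake option defaults to `ON`, and the Makefile enables it unless `LLAMA_NO_TUNE` is defined. A hedged sketch of how a build would opt back out, assuming these switches behave as named:

```
# Makefile: build without mulmat tune
make clean && LLAMA_NO_TUNE=1 make

# CMake: build without mulmat tune
cmake .. -DLLAMA_TUNE=OFF
cmake --build . --config Release
```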

View File

@ -345,7 +345,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.mem_test = true;
} else if (arg == "--export") {
params.export_cgraph = true;
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
} else if (arg == "--tune") {
params.tune = true;
} else if (arg == "--tune-file") {
@ -354,7 +354,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.tune_file = argv[i];
#endif // GGML_USE_MULMAT_TUNE
#endif // GGML_USE_TUNE
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
@ -508,7 +508,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n");
fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
#endif
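To make the two usage lines above concrete, here is a hedged sketch of the intended invocation modes; the model path and tune file name are placeholders, not taken from this commit:

```
# bench mulmat, write the results to the tune file, then exit
./main -m models/7B/ggml-model-q4_0.bin -t 4 --tune --tune-file mulmat-tune.txt

# load previously saved tune data and run as usual
./main -m models/7B/ggml-model-q4_0.bin -t 4 --tune-file mulmat-tune.txt

# bench at startup without saving, then keep running
./main -m models/7B/ggml-model-q4_0.bin -t 4 --tune
```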

View File

@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
return 1;
}
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
if (params.tune || !params.tune_file.empty()) {
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) {

View File

@ -23,13 +23,13 @@ run bench ahead of time (saving tens of seconds), but there are two shortcomings
Makefile:
```
make clean && LLAMA_MULMAT_TUNE=1 make
make clean && make
```
CMake (with BLAS):
```
cmake --build . --target clean
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
cmake .. -DLLAMA_BLAS=ON
cmake --build . --config Release
```
@ -52,13 +52,13 @@ Run examples:
Makefile:
```
make clean && LLAMA_MULMAT_TUNE=1 make
make clean && make
```
CMake (with BLAS)
```
cmake --build . --target clean
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
cmake .. -DLLAMA_BLAS=ON
cmake --build . --config Release
```
@ -103,22 +103,29 @@ setup properly.
General steps:
1. run `./mulmat-tune -h` to see how to build for misc vendors.
you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the debug, e.g.:
To enable debugging, comment out `-DGGML_TUNE_NDEBUG` in the Makefile, then run:
```
make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make
make clean; make
```
On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
with `CUDA` or `CL`, you may not see `CUDA` or `CL` in the debug output because `CPU`
or `CPU_BLAS` is faster (according to the estimation from mulmat tune).
or `CPU_BLAS` is faster (according to the estimation from mulmat tune); try running
with `-t 1`.
2. create a small prompt file:
```
head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
```
3. run any of the following example commands.
```
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
```
* `--mlock` is recommended on `macOS`, but you may not want to use it.
* don't change `-c 128`: a context size that is too large results in 0 perplexity chunks.
* `-t` is the number of threads; `1`, `2`, `4` or `6` are recommended.

View File

@ -262,8 +262,6 @@ int main(int argc, char **argv) {
struct ggml_mulmat_tune_params params;
memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
ggml_mulmat_init_task_profiles();
ggml_mulmat_tune_model_init(&params.model, model_name, ftype);
params.m_num = m_num;
params.n_pass = n_pass;

View File

@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
return 1;
}
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
if (params.tune || !params.tune_file.empty()){
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) {

View File

@ -394,7 +394,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
enum ggml_compute_error err =
shared->task_runner(&state->params, state->node);
GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
GGML_ASSERT(err == GGML_COMPUTE_OK);
ggml_spin_lock(&shared->spin);
@ -433,7 +433,11 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
// This is the params for main thread.
struct ggml_compute_params params;
enum ggml_compute_error err;
enum ggml_compute_error err = GGML_COMPUTE_OK;
START:
memset(&params, 0, sizeof(struct ggml_compute_params));
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
@ -504,11 +508,19 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
}
if (err != GGML_COMPUTE_OK) {
if (err == GGML_COMPUTE_FALLBACK) {
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
int n = ggml_get_task_profiles(node, profiles);
GGML_ASSERT(n > 0);
memcpy(&node->task_profile, &profiles[0],
sizeof(struct ggml_task_profile));
goto START;
}
return err;
}
}
return GGML_COMPUTE_OK;
return err;
}
struct ggml_threading_context *

View File

@ -55,7 +55,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
struct ggml_task_profile *prof = NULL;
const struct ggml_task_profile *prof = NULL;
if (e->M == M && e->N == N && e->K == K) {
prof = e->profile;
@ -97,10 +97,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
e->N = N;
e->K = K;
// to disable this, build with
// `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1
// make`
#if !defined(GGML_MULMAT_TUNE_NDEBUG)
#ifndef GGML_TUNE_NDEBUG
const char *names[3];
for (int i = 0; i < 3; i++) {
names[i] = ggml_mulmat_tune_task_backend_name(
@ -163,8 +160,8 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params,
struct ggml_task_profile_factory *pf) {
ggml_task_profiles_provider *profiles_provider) {
GGML_ASSERT(profiles_provider);
struct ggml_mulmat_tune_model *model = &params->model;
memset(tune, 0, sizeof(struct ggml_mulmat_tune));
@ -208,8 +205,20 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
for (int i = 0; i < tune->n_shapes; i++) {
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
shape->n_profiles = ggml_mulmat_get_task_profiles(
pf, shape->src0_type, shape->src1_type, &shape->profiles);
struct ggml_tensor src0 = {
.type = shape->src0_type,
};
struct ggml_tensor src1 = {
.type = shape->src1_type,
};
struct ggml_tensor node = {
.op = GGML_OP_MUL_MAT,
.src0 = &src0,
.src1 = &src1,
};
shape->n_profiles = profiles_provider(&node, shape->profiles);
if (shape->n_profiles == 0) {
// allowed for testing.
continue;
@ -304,9 +313,20 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
for (int i = 0; i < tune->n_shapes; i++) {
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
struct ggml_task_profile *builtin_profiles = NULL;
int n_profiles = ggml_mulmat_get_task_profiles(
NULL, shape->src0_type, shape->src1_type, &builtin_profiles);
struct ggml_tensor src0 = {
.type = shape->src0_type,
};
struct ggml_tensor src1 = {
.type = shape->src1_type,
};
struct ggml_tensor node = {
.op = GGML_OP_MUL_MAT,
.src0 = &src0,
.src1 = &src1,
};
struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
if (n_profiles != shape->n_profiles) {
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
@ -382,13 +402,6 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
memset(shape->items, 0, item_size);
}
{
size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles;
shape->profiles = malloc(sz);
GGML_ASSERT(shape->profiles);
memset(shape->profiles, 0, sz);
}
for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip];
for (int j = 0; j < 3; j++) {
@ -468,7 +481,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
}
}
struct ggml_task_profile *profile = &shape->profiles[ip];
const struct ggml_task_profile *profile = &shape->profiles[ip];
for (int k = 0; k < 3; k++) {
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
rc = fprintf(fp, "%9d", item->stages_time[k]);
@ -537,7 +550,7 @@ void ggml_mulmat_tune_estimate_time(
const int max_m = shape->items[m_num - 1].M;
for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip];
const struct ggml_task_profile *profile = &shape->profiles[ip];
profile_time[ip].total_time = 0;
profile_time[ip].profile = profile;
@ -573,7 +586,7 @@ void ggml_mulmat_tune_estimate_time(
GGML_ASSERT(p0 && p1);
for (int i_stage = 0; i_stage < 3; i_stage++) {
struct ggml_task_stage *stage = &profile->stages[i_stage];
const struct ggml_task_stage *stage = &profile->stages[i_stage];
if (stage->backend == GGML_TASK_BACKEND_NONE) {
continue;
}
@ -736,7 +749,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
return false;
}
bool ok = ggml_mulmat_tune_init(tune, params, NULL);
bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
if (!ok) {
return false;
}

View File

@ -46,7 +46,7 @@ struct ggml_mulmat_tune_shape {
enum ggml_type src1_type;
int n_profiles;
struct ggml_task_profile *profiles;
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
int m_num;
int *arr_m;
@ -69,7 +69,7 @@ struct ggml_mulmat_tune {
};
struct ggml_mulmat_tune_time {
struct ggml_task_profile *profile;
const struct ggml_task_profile *profile;
int stage_time[3];
int total_time;
};
@ -78,7 +78,7 @@ struct mm_cache_element {
int M;
int N;
int K;
struct ggml_task_profile *profile;
const struct ggml_task_profile *profile;
int stages_time[3];
};
@ -108,7 +108,7 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params,
struct ggml_task_profile_factory *profile_factory);
ggml_task_profiles_provider *profiles_provider);
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);

533
ggml.c
View File

@ -144,7 +144,7 @@ inline static void* ggml_aligned_malloc(size_t size) {
#include "ggml-opencl.h"
#endif
#if defined(GGML_USE_MULMAT_TUNE)
#if defined(GGML_USE_TUNE)
#include "ggml-tune.h"
#endif
@ -4043,8 +4043,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_cl_init();
#endif
ggml_mulmat_init_task_profiles();
is_first_call = false;
}
@ -15524,20 +15522,89 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
return result;
}
// ---- mulmat task profiles ----
// ---- task profiles ----
static struct ggml_task_profile_factory default_task_profile_factory = {0};
// Implements `ggml_task_profiles_provider`.
// Fills `profiles` for the `node` and returns the number of profiles.
//
// NOTE: the node may be incomplete (e.g. when called from testing or tuning), so
// assert everything used here.
inline int ggml_get_task_profiles(
struct ggml_tensor *node,
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) {
GGML_ASSERT(node);
GGML_ASSERT(node->op >= 0);
GGML_ASSERT(profiles);
// TODO: thread unsafe. Should be initialized once.
void ggml_mulmat_init_task_profiles(void) {
const size_t sz = sizeof(struct ggml_task_profile_factory);
memset(&default_task_profile_factory, 0, sz);
memset(profiles, 0,
sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES);
struct ggml_task_profile *p = profiles;
int n_profiles = 0;
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_ADD:
case GGML_OP_ADD1: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_ACC: {
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_SUB:
case GGML_OP_DIV:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_LOG:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK:
case GGML_OP_ABS:
case GGML_OP_SGN:
case GGML_OP_NEG:
case GGML_OP_STEP:
case GGML_OP_RELU: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_MUL: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_GELU:
case GGML_OP_SILU:
case GGML_OP_SILU_BACK:
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_MUL_MAT:
case GGML_OP_OUT_PROD: {
GGML_ASSERT(node->src0);
GGML_ASSERT(node->src1);
enum ggml_type src0_t = node->src0->type;
enum ggml_type src1_t = node->src1->type;
GGML_ASSERT(src1_t == GGML_TYPE_F32);
// f32
{
struct ggml_task_profile *p = default_task_profile_factory.f32_f32;
int i = 0;
if (src0_t == GGML_TYPE_F32) {
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].parallel = true;
i++;
@ -15557,14 +15624,7 @@ void ggml_mulmat_init_task_profiles(void) {
p[i].stages[1].wait = true;
i++;
#endif
default_task_profile_factory.n_f32_f32 = i;
}
// f16
{
struct ggml_task_profile *p = default_task_profile_factory.f16_f32;
int i = 0;
} else if (src0_t == GGML_TYPE_F16) {
p[i].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].parallel = true;
@ -15585,14 +15645,7 @@ void ggml_mulmat_init_task_profiles(void) {
p[i].stages[1].wait = true;
i++;
#endif
default_task_profile_factory.n_f16_f32 = i;
}
// qxx
{
struct ggml_task_profile *p = default_task_profile_factory.qxx_f32;
int i = 0;
} else if (ggml_is_quantized(src0_t)) {
p[i].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].parallel = true;
@ -15615,73 +15668,108 @@ void ggml_mulmat_init_task_profiles(void) {
p[i].stages[1].wait = true;
i++;
#endif
default_task_profile_factory.n_qxx_f32 = i;
}
}
int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf,
enum ggml_type src0_t, enum ggml_type src1_t,
struct ggml_task_profile **profiles) {
GGML_ASSERT(profiles);
if (pf == NULL) {
pf = &default_task_profile_factory;
n_profiles = i;
} break;
case GGML_OP_SCALE: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_SET: {
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_GET_ROWS:
case GGML_OP_GET_ROWS_BACK:
case GGML_OP_DIAG:
case GGML_OP_DIAG_MASK_ZERO: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_ALIBI: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_CLAMP: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_CONV_1D_1S:
case GGML_OP_CONV_1D_2S: {
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_FLASH_ATTN: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_FLASH_FF: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
GGML_ASSERT(src1_t == GGML_TYPE_F32);
if (src0_t == GGML_TYPE_F32) {
*profiles = pf->f32_f32;
return pf->n_f32_f32;
}
if (src0_t == GGML_TYPE_F16) {
*profiles = pf->f16_f32;
return pf->n_f16_f32;
}
if (ggml_is_quantized(src0_t)) {
*profiles = pf->qxx_f32;
return pf->n_qxx_f32;
}
case GGML_OP_FLASH_ATTN_BACK: {
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_MAP_UNARY:
case GGML_OP_MAP_BINARY: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_CROSS_ENTROPY_LOSS: {
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
p[0].stages[2].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_NONE:
case GGML_OP_COUNT: {
GGML_ASSERT(false);
}
static const struct ggml_task_profile *
ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf,
enum ggml_type src0_type,
enum ggml_type src1_type) {
GGML_ASSERT(src1_type == GGML_TYPE_F32);
if (pf == NULL) {
pf = &default_task_profile_factory;
}
struct ggml_task_profile *p = NULL;
if (src0_type == GGML_TYPE_F32) {
p = &pf->f32_f32[0];
} else if (src0_type == GGML_TYPE_F16) {
p = &pf->f16_f32[0];
} else if (ggml_is_quantized(src0_type)) {
p = &pf->qxx_f32[0];
} else {
} break;
default:
GGML_ASSERT(false);
}
for (int i = 0; i < 3; i++) {
GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU ||
p->stages[i].backend == GGML_TASK_BACKEND_NONE);
}
return p;
GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES);
return n_profiles;
}
// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD.
static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node,
struct ggml_mulmat_tune *tune) {
static const struct ggml_task_profile *ggml_mulmat_get_task_profile(
struct ggml_tensor *node, struct ggml_task_profile *profiles,
int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) {
GGML_ASSERT(node);
GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD);
GGML_ASSERT(profiles);
GGML_ASSERT(n_profiles >= 2);
enum ggml_type src0_t = node->src0->type;
enum ggml_type src1_t = node->src1->type;
@ -15697,42 +15785,26 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node,
int N = (int)node->ne[0];
int K = (int)node->src1->ne[0];
struct ggml_task_profile *profiles = NULL;
int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles);
GGML_ASSERT(n_profiles >= 2);
GGML_ASSERT(profiles);
const struct ggml_task_profile *prof = NULL;
if (cond_match) {
#if defined(GGML_USE_MULMAT_TUNE)
#if defined(GGML_USE_TUNE)
if (tune != NULL) {
int stages_time_us[3];
prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us);
prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t,
src1_t, stages_time_us);
if (prof != NULL) {
GGML_ASSERT(prof);
memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile));
// Do not wait if the estimated execution time is too small (e.g. less than 0.1 ms)
// TODO: need bench actual wait/notify time, see ggml-threading.c
for (int i = 0; i < 3; i++) {
if (node->task_profile.stages[i].wait) {
if (stages_time_us[i] < 100) {
node->task_profile.stages[i].wait = false;
}
}
}
return;
return prof;
}
}
#else
UNUSED(tune);
UNUSED(stages_time_us);
#endif
if (prof == NULL && M >= 32 && N >= 32 && K >= 32) {
for (int j = 0; j < n_profiles; j++) {
enum ggml_task_backend comp_be =
profiles[j].stages[GGML_TASK_COMPUTE].backend;
switch (comp_be) {
case GGML_TASK_BACKEND_GPU_CUDA: {
GGML_ASSERT(ggml_cpu_has_cublas());
@ -15753,76 +15825,131 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node,
break;
}
}
if (prof) {
break;
}
}
}
}
if (prof == NULL) {
prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t);
prof = &profiles[0];
GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU);
}
GGML_ASSERT(prof);
memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile));
return prof;
}
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
int n_threads = cgraph->n_threads;
if (ggml_cpu_has_blas()) {
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor *node = cgraph->nodes[i];
memset(&node->task_profile, 0, sizeof(struct ggml_task_profile));
struct ggml_task_stage *stages = node->task_profile.stages;
// Adapt node->backend: assume GPU at COMPUTE stage.
if (node->backend > GGML_BACKEND_CPU) {
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE;
stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE;
stages[GGML_TASK_COMPUTE].parallel = false;
bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL);
stages[GGML_TASK_COMPUTE].wait = wait;
if (ggml_cpu_has_cublas()) {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA;
} else if (ggml_cpu_has_clblast()) {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL;
} else {
GGML_ASSERT(false);
}
} else if (node->op == GGML_OP_MUL_MAT) {
struct ggml_mulmat_tune * tune = NULL;
#if defined(GGML_USE_MULMAT_TUNE)
tune = cgraph->tune;
#endif
ggml_mulmat_set_tensor_task_profile(node, tune);
} else if (node->op == GGML_OP_OUT_PROD) {
ggml_mulmat_set_tensor_task_profile(node, NULL);
}
}
}
struct ggml_threading_context *thrd_ctx = ggml_threading_start(
n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward,
GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL);
// initialize tasks + work buffer
{
// int64_t t0 = ggml_time_us();
size_t work_size = 0;
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
// thread scheduling for the different operations
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) {
continue;
}
int n_profiles = ggml_get_task_profiles(node, profiles);
const struct ggml_task_profile *profile = NULL;
// Adapt node->backend: assume GPU at COMPUTE stage.
if (node->backend == GGML_BACKEND_GPU ||
node->backend == GGML_BACKEND_GPU_SPLIT) {
enum ggml_task_backend be;
if (ggml_cpu_has_cublas()) {
be = GGML_TASK_BACKEND_GPU_CUDA;
} else if (ggml_cpu_has_clblast()) {
be = GGML_TASK_BACKEND_GPU_CL;
} else {
GGML_ASSERT(false);
}
for (int j = 0; j < n_profiles; j++) {
if (profiles[j].stages[1].backend == be) {
profile = &profiles[j];
break;
}
}
GGML_ASSERT(profile);
} else {
GGML_ASSERT(node->backend == GGML_BACKEND_CPU);
}
bool profile_copied = false;
if (node->op == GGML_OP_MUL_MAT) {
#if defined(GGML_USE_TUNE)
int stages_time_us[3];
profile = ggml_mulmat_get_task_profile(
node, profiles, n_profiles, cgraph->tune, stages_time_us);
GGML_ASSERT(profile);
if (cgraph->tune) {
memcpy(&node->task_profile, profile,
sizeof(struct ggml_task_profile));
profile_copied = true;
// Do not wait if the estimated execution time is too small
// (e.g. less than 0.1 ms)
// TODO: need to bench the actual wait/notify time, see
// ggml-threading.c
for (int j = 0; j < 3; j++) {
if (node->task_profile.stages[j].wait) {
if (stages_time_us[j] < 100) {
node->task_profile.stages[j].wait = false;
}
}
}
}
#else
profile = ggml_mulmat_get_task_profile(node, profiles,
n_profiles, NULL, NULL);
GGML_ASSERT(profile);
#endif
} else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this right?
profile = ggml_mulmat_get_task_profile(node, profiles,
n_profiles, NULL, NULL);
GGML_ASSERT(profile);
} else {
profile = &profiles[0];
GGML_ASSERT(profile->stages[1].backend ==
GGML_TASK_BACKEND_CPU);
}
if (!profile_copied) {
memcpy(&node->task_profile, profile,
sizeof(struct ggml_task_profile));
}
struct ggml_task_stage *stages = node->task_profile.stages;
// compute stage n_tasks.
int n_tasks = stages[1].parallel ? n_threads : 1;
// Allocate temp buffer `wdata` for CPU.
// NOTE: GPU may fall back to CPU, so we have to cover all possible cases.
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
size_t cur = 0;
if (ggml_is_quantized(node->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
@ -15830,27 +15957,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_ADD:
case GGML_OP_ADD1:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0;
if (ggml_is_quantized(node->src0->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
} break;
case GGML_OP_ACC:
{
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0;
if (ggml_is_quantized(node->src0->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
@ -15870,16 +15990,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_NEG:
case GGML_OP_STEP:
case GGML_OP_RELU:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_MUL:
{
if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
}
} break;
case GGML_OP_GELU:
case GGML_OP_SILU:
case GGML_OP_SILU_BACK:
@ -15887,28 +15998,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
} break;
case GGML_OP_MUL_MAT:
case GGML_OP_OUT_PROD:
case GGML_OP_OUT_PROD: // FIXME: is this right?
{
size_t cur = 0;
enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend;
GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE);
// TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct.
if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) {
enum ggml_type src0_t = node->src0->type;
enum ggml_type src1_t = node->src1->type;
bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 ||
ggml_is_quantized(src0_t)) &&
src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 &&
ggml_is_contiguous(node->src0) &&
ggml_is_contiguous(node->src1);
GGML_ASSERT(cond_match);
}
if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
#if defined(GGML_USE_CLBLAST)
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
@ -15930,7 +16027,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
GGML_ASSERT(false);
}
} else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
// We have to reserve a buffer for CUDA because it may fall back to CPU.
if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
GGML_ASSERT(ggml_cpu_has_cublas());
}
@ -15955,13 +16051,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break;
case GGML_OP_SCALE:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
} break;
case GGML_OP_SET:
{
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
@ -15972,33 +16064,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_GET_ROWS_BACK:
case GGML_OP_DIAG:
case GGML_OP_DIAG_MASK_ZERO:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
} break;
case GGML_OP_ALIBI:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_CLAMP:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_CONV_1D_1S:
case GGML_OP_CONV_1D_2S:
{
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
GGML_ASSERT(node->src0->ne[3] == 1);
GGML_ASSERT(node->src1->ne[2] == 1);
GGML_ASSERT(node->src1->ne[3] == 1);
@ -16026,62 +16103,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break;
case GGML_OP_FLASH_ATTN:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0;
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
}
if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
}
work_size = MAX(work_size, cur);
} break;
case GGML_OP_FLASH_FF:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0;
if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2
cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2
}
if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2
cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2
}
work_size = MAX(work_size, cur);
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0;
const int64_t D = node->src0->ne[0];
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
}
if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
}
work_size = MAX(work_size, cur);
@ -16089,31 +16157,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_MAP_UNARY:
case GGML_OP_MAP_BINARY:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_CROSS_ENTROPY_LOSS:
{
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU;
size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads);
size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_tasks);
work_size = MAX(work_size, cur);
} break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads;
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks;
work_size = MAX(work_size, cur);
} break;
case GGML_OP_NONE:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_COUNT:
{
@ -16134,6 +16192,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
}
// ~ 50 us
//printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0));
}
const int64_t perf_start_cycles = ggml_perf_cycles();
@ -16162,16 +16223,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
enum ggml_compute_error err =
ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
if (err == GGML_COMPUTE_FALLBACK) {
if (node->op == GGML_OP_MUL_MAT) {
const struct ggml_task_profile *p =
ggml_mulmat_get_default_task_profile(
NULL, node->src0->type, node->src1->type);
memcpy(&node->task_profile, p,
sizeof(struct ggml_task_profile));
}
err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
}
GGML_ASSERT(err == GGML_COMPUTE_OK);
// performance stats (node)

28
ggml.h
View File

@ -202,7 +202,7 @@
#define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32
#define GGML_DEFAULT_N_THREADS 4
#define GGML_MAX_TASK_PROFILES 8
#define GGML_MAX_TASK_PROFILES 4
#define GGML_ASSERT(x) \
do { \
@ -399,17 +399,6 @@ extern "C" {
uint8_t dev_flags[4];
};
struct ggml_task_profile_factory {
struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES];
int n_f32_f32;
struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES];
int n_f16_f32;
struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES];
int n_qxx_f32;
};
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@ -450,6 +439,11 @@ extern "C" {
char padding[12];
};
// Fill `profiles` for the `node` and return number of profiles.
typedef int (ggml_task_profiles_provider) (
struct ggml_tensor *node,
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]);
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// computation graph
@ -1345,15 +1339,11 @@ extern "C" {
GGML_API int ggml_cpu_has_vsx (void);
//
// mulmat task profiles
// task profiles
//
GGML_API void ggml_mulmat_init_task_profiles(void);
GGML_API int ggml_mulmat_get_task_profiles(
struct ggml_task_profile_factory *pf,
enum ggml_type src0_t,
enum ggml_type src1_t,
struct ggml_task_profile **profiles);
// Implements `ggml_task_profiles_provider`.
GGML_API int ggml_get_task_profiles (struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]);
//
// Internal types and functions exposed for tests and benchmarks

View File

@ -21,7 +21,7 @@
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
#include "ggml-tune.h"
#endif
@ -285,7 +285,7 @@ struct llama_context {
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
struct ggml_mulmat_tune *tune = nullptr;
#endif
@ -1408,7 +1408,7 @@ static bool llama_eval_internal(
ggml_cgraph gf = {};
gf.n_threads = n_threads;
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
gf.tune = lctx.tune;
#endif
@ -2743,7 +2743,7 @@ struct llama_context * llama_init_from_file(
return ctx;
}
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
printf("\n");
if (ctx->model.n_gpu_layers != 0) {
@ -2882,7 +2882,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
#endif
void llama_free(struct llama_context * ctx) {
#ifdef GGML_USE_MULMAT_TUNE
#ifdef GGML_USE_TUNE
if (ctx->tune) {
delete(ctx->tune);
}

View File

@ -356,6 +356,17 @@ int main(void) {
{
++n_tests;
// required for getting task profiles.
node.op = GGML_OP_MUL_MAT;
struct ggml_tensor src0 = {
.type = GGML_TYPE_Q4_0,
};
struct ggml_tensor src1 = {
.type = GGML_TYPE_F32,
};
node.src0 = &src0;
node.src1 = &src1;
node.backend = GGML_BACKEND_GPU;
if (test_fallback(&node) == 0) {
++n_passed;

View File

@ -3,6 +3,8 @@
#include <string.h>
#define UNUSED(x) (void)(x)
static int bench(void);
static int estimate_time_non_zero_NK(void);
@ -77,6 +79,20 @@ static int bench(void) {
return ok ? 0 : 1;
}
// implement `ggml_task_profiles_provider`
static int
ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
struct ggml_task_profile *profiles) {
UNUSED(node);
profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
return 2;
}
int estimate_time_non_zero_NK(void) {
printf("test: %s\n", __func__);
@ -92,22 +108,10 @@ int estimate_time_non_zero_NK(void) {
const int m_num = 2;
struct ggml_task_profile_factory pf;
memset(&pf, 0, sizeof(struct ggml_task_profile_factory));
{
pf.n_qxx_f32 = 2;
pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
}
struct ggml_mulmat_tune_params params;
init_params(&params, m_num);
ggml_mulmat_tune_init(&tune, &params, &pf);
ggml_mulmat_tune_init(&tune, &params, ggml_task_profiles_mock_qxx_provider);
struct ggml_mulmat_tune_shape *shape = NULL;
for (int i = 0; i < tune.n_shapes; i++) {