diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md index df023757a..4e521211d 100644 --- a/examples/mulmat-tune/README.md +++ b/examples/mulmat-tune/README.md @@ -214,26 +214,19 @@ The following results are generated with Accelerate compiled. **Example** ``` -5 3B 2 6 1 +[tune] done, elapsed time: 0 seconds. +10 xB 12 4 2 -3200 3200 2 0 3 10 -16 0 0 0 16 1 0 1 0 0 0 0 -16 1 0 2 17 0 1 0 0 0 0 0 - 0 0 0 0 34 0 1 0 0 0 0 0 - 1 1 793 0 9103 2102 0 0 6014 0 - 2 2 1591 0 8034 2305 0 0 30982 0 - 4 4 2236 0 6476 2484 0 0 31388 0 - 8 7 4161 0 6623 2389 0 0 29204 0 - 16 15 8339 0 6434 2752 0 0 34303 0 - 32 32 16919 0 6915 3651 0 0 42511 0 - 64 200 34270 0 6574 4528 0 0 68212 0 - 128 188 69400 0 6325 6839 0 0 74437 0 - 256 303 134597 0 6168 11544 0 0 110180 0 - 512 687 279685 0 6337 29712 0 0 159728 0 +1024 1024 12 0 2 4 +100 110 000 1 CPU +110 101 000 2 BLAS + 1 11 309 0 1234 90 0 + 2 23 654 0 1359 215 0 + 4 44 1283 0 1362 421 0 + 8 85 2341 0 1357 347 0 -3200 8640 2 0 2 10 - - ... +1024 2048 12 0 2 4 +... ``` @@ -249,17 +242,17 @@ shape+ # head version: 1 model: "3B" | "7B" | "13B" | "30B" | "65B" -ggml_ftype: 0 - 4, 7 - 14 +ggml_ftype: 0 - 3, 7 - 14 n_shapes: number of shapes n_threads: number of threads -shape := N K m_num n_profiles -task_conf_profile+ +shape := N K src0_ggml_type src1_ggml_type n_profiles m_num +task_profile+ bench_item+ -task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) -stage_conf: backend parallel wait -backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) +task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name +stage_conf(bitmap): valid parallel wait +valid: 0 (false) | 1 (true) parallel: 0 (false) | 1 (true) wait: 0 (false) | 1 (true) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index da1d0a1c1..ba1cc0f8a 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -111,6 +111,11 @@ static void usage(char *prog) { } int main(int argc, char **argv) { + if (!ggml_cpu_has_blas()) { + fprintf(stderr, "error: this program is not built with BLAS.\n"); + return 1; + } + if (argc == 2) { if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { usage(argv[0]); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index cf52109bc..5a4c7725a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); } -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - +// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run. 
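For readers decoding the new tune-file layout documented in the README hunk above: each `task_profile` line is three `stage_conf` bitmaps (`valid parallel wait` for INIT, COMPUTE, FINALIZE) followed by the profile id and name. A minimal standalone sketch of that decoding, using the same `%1d%1d%1d` scanning that the reader in `ggml-tune.c` uses (the struct and variable names here are illustrative, not from the patch):

```c
#include <stdio.h>

struct stage_conf { int valid, parallel, wait; };

int main(void) {
    // One task_profile line from the README example above: "110 101 000 2 BLAS"
    const char *line = "110 101 000 2 BLAS";

    struct stage_conf st[3]; // INIT, COMPUTE, FINALIZE
    int id = 0;
    char name[16] = {0};

    int n = sscanf(line, "%1d%1d%1d %1d%1d%1d %1d%1d%1d %d %15s",
                   &st[0].valid, &st[0].parallel, &st[0].wait,
                   &st[1].valid, &st[1].parallel, &st[1].wait,
                   &st[2].valid, &st[2].parallel, &st[2].wait,
                   &id, name);
    if (n != 11) {
        return 1;
    }

    printf("profile %d (%s)\n", id, name);
    for (int i = 0; i < 3; i++) {
        printf("  stage %d: valid=%d parallel=%d wait=%d\n",
               i, st[i].valid, st[i].parallel, st[i].wait);
    }
    // For "110 101 000 2 BLAS": INIT is valid and parallel (dequantize),
    // COMPUTE is valid and waits (single-threaded BLAS), FINALIZE is unused.
    return 0;
}
```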
+static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // TODO: find the optimal values for these if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + dst->type == GGML_TYPE_F32) { return true; } @@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() { g_scratch_buffer = nullptr; } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ - ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU +bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->src0); + return tensor->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); +} + +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ + ggml_cuda_func_t func; + const bool any_on_device = is_gpu_offloading(tensor); switch (tensor->op) { case GGML_OP_ADD: @@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { return false; } func = ggml_cuda_mul_mat; diff --git a/ggml-cuda.h b/ggml-cuda.h index d32b44842..75ea94392 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -16,7 +16,7 @@ void ggml_init_cublas(void); void ggml_cuda_set_tensor_split(const float * tensor_split); void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cuda_is_gpu_offloading(const struct ggml_tensor * src0); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 2a1a04fca..28098793d 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } } +bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor); + return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) || + (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU); +} -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these +// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run. 
+static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 /*&& - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) { + dst->type == GGML_TYPE_F32) { return true; } diff --git a/ggml-opencl.h b/ggml-opencl.h index a92b445c9..1de12f55a 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -9,7 +9,7 @@ extern "C" { void ggml_cl_init(void); void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor); size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-threading.c b/ggml-threading.c index 7ef763c0f..dada9f3fe 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { struct ggml_compute_state_shared *shared = state->shared; GGML_ASSERT(shared); - GGML_ASSERT(shared->task_runner); + //GGML_ASSERT(shared->task_runner); shared->n_ready++; @@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { : shared->task_runner; enum ggml_compute_error err = runner(&state->params, state->node); - GGML_ASSERT(err == GGML_COMPUTE_OK); + GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); ggml_spin_lock(&shared->spin); @@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, size_t wsize) { GGML_ASSERT(ctx); GGML_ASSERT(node); - GGML_ASSERT(ctx->shared.task_runner); + // GGML_ASSERT(ctx->shared.task_runner); ggml_task_runner *runner = ctx->shared.task_runner; if (node->task_profile.runner) { @@ -448,7 +448,7 @@ START: memset(¶ms, 0, sizeof(struct ggml_compute_params)); for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { - if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { + if (!node->task_profile.stages[type].valid) { continue; } @@ -519,18 +519,17 @@ START: if (err == GGML_COMPUTE_FALLBACK) { PRINT_DEBUG("[main] fallback from profile, id=%d\n", node->task_profile.id); - GGML_ASSERT(node->task_profile.stages[1].backend > - GGML_TASK_BACKEND_CPU); + GGML_ASSERT(node->task_profile.id > 1); struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int n = ggml_get_task_profiles(node, profiles); GGML_ASSERT(n > 0); - GGML_ASSERT(profiles[0].stages[1].backend == - GGML_TASK_BACKEND_CPU); + GGML_ASSERT(profiles[0].id == 1); memcpy(&node->task_profile, &profiles[0], - sizeof(struct ggml_task_profile)); + sizeof(struct ggml_task_profile)); runner = ctx->shared.task_runner; + GGML_ASSERT(runner); goto START; } diff --git a/ggml-threading.h b/ggml-threading.h index 189fc2ed5..81192450c 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // thread: optional OS thread runner, default value: // `ggml_threading_graph_compute_thread`. 
// -// features: optional for configure +// task_runner: default task runner, nullable wheen tensor.runner is not NULL. +// Overridden by tensor.runner. +// features: configure threading behaviour, optional. // threading additional features. see `ggml_threading_feature`, default 0. // // stages_time: optional for collecting per-stage wall clock time. @@ -51,12 +53,6 @@ enum ggml_compute_error ggml_threading_compute_tensor(struct ggml_threading_context *ctx, struct ggml_tensor *node, void *wdata, size_t wsize); - -// This is an experimental functionality for mulmat tune, as a thin wrapper. -enum ggml_compute_error -ggml_compute_forward_wrapper(const struct ggml_compute_params *params, - struct ggml_tensor *tensor); - #ifdef __cplusplus } #endif diff --git a/ggml-tune.c b/ggml-tune.c index aeb63e957..444269ae4 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -24,26 +24,7 @@ static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) { return hash; } -static const char * -ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { - switch (backend) { - case GGML_TASK_BACKEND_NONE: - return ""; - case GGML_TASK_BACKEND_CPU: - return "CPU"; - case GGML_TASK_BACKEND_CPU_BLAS: - return "BLAS"; - case GGML_TASK_BACKEND_GPU: - return "GPU"; - case GGML_TASK_BACKEND_GPU_CUDA: - return "CUDA"; - case GGML_TASK_BACKEND_GPU_CL: - return "CL"; - default: - GGML_ASSERT(false); - } -} - +// Return profile id, -1 when failed (such as unable to match shape). // NOTE: we can not use the profile from tune because the profiles do not // contain fields such as runner, get_size. int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, @@ -101,20 +82,15 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, e->K = K; #ifndef GGML_TUNE_NDEBUG - const char *names[3]; - for (int i = 0; i < 3; i++) { - names[i] = ggml_mulmat_tune_task_backend_name( - prof->stages[i].backend); - } printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " "backends: %s %s %s\n", - M, N, K, prof->id, names[0], names[1], names[2]); + M, N, K, prof->id, prof->name); #endif } } } - return prof->id; + return prof ? prof->id : -1; } void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, @@ -283,25 +259,24 @@ static bool ggml_mulmat_tune_write_profiles( int rc; for (int i = 0; i < n_profiles; i++) { const struct ggml_task_profile *profile = &profiles[i]; - rc = fprintf(fp, "%d ", profile->id); - if (rc <= 0) { - return false; - } - for (int j = 0; j < 3; j++) { const struct ggml_task_stage *ts = &profile->stages[j]; - rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, - ts->wait ? 1 : 0); + rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0, + ts->parallel ? 1 : 0, ts->wait ? 
1 : 0); if (rc <= 0) { return false; } if (j < 2) { - rc = fprintf(fp, " "); + rc = fprintf(fp, " "); if (rc <= 0) { return false; } } } + rc = fprintf(fp, " %d %s", profile->id, profile->name); + if (rc <= 0) { + return false; + } rc = fprintf(fp, "\n"); if (rc <= 0) { return false; @@ -407,24 +382,24 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, return ok; } -bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { +int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { GGML_ASSERT(tune); memset(tune, 0, sizeof(struct ggml_mulmat_tune)); int rc = fscanf(fp, "%d", &tune->version); if (rc <= 0) { - return false; + return 1; } if (tune->version != GGML_MULMAT_TUNE_VERSION) { fprintf(stderr, "[tune] version mismatch, run bench again\n"); - return false; + return 2; } rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, &tune->n_shapes, &tune->n_threads); if (rc <= 0) { - return false; + return 3; } for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { @@ -434,7 +409,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { (int *)&shape->src0_type, (int *)&shape->src1_type, &shape->n_profiles, &shape->m_num); if (rc <= 0) { - return false; + return 4; } { @@ -451,24 +426,24 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; - rc = fscanf(fp, "%d ", &profile->id); - if (rc <= 0) { - return false; - } - for (int j = 0; j < 3; j++) { struct ggml_task_stage *ts = &profile->stages[j]; - int backend; + int valid; int parallel; int wait; - rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait); + rc = fscanf(fp, " %1d%1d%1d", &valid, ¶llel, &wait); if (rc <= 0) { - return false; + return 5; } - ts->backend = (enum ggml_task_backend)backend; + ts->valid = valid ? true : false; ts->parallel = parallel ? true : false; ts->wait = wait ? 
true : false; } + + rc = fscanf(fp, "%d %s", &profile->id, profile->name); + if (rc <= 0) { + return 6; + } } for (int i_m = 0; i_m < shape->m_num; i_m++) { @@ -477,7 +452,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { if (ip == 0) { rc = fscanf(fp, "%d", &M); if (rc <= 0) { - return false; + return 7; } } struct ggml_mulmat_tune_m *item = @@ -486,13 +461,13 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { rc = fscanf(fp, "%d %d %d", &item->stages_time[0], &item->stages_time[1], &item->stages_time[2]); if (rc <= 0) { - return false; + return 8; } } } } - return true; + return 0; } bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, @@ -535,7 +510,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, const struct ggml_task_profile *profile = &shape->profiles[ip]; for (int k = 0; k < 3; k++) { - if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { + if (profile->stages[k].valid) { rc = fprintf(fp, "%9d", item->stages_time[k]); if (rc <= 0) { return false; @@ -562,8 +537,6 @@ const struct ggml_mulmat_tune_shape * ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, const int K, enum ggml_type src0_type, enum ggml_type src1_type) { - GGML_ASSERT(N > 0 && K > 0); - for (int i = 0; i < tune->n_shapes; i++) { const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; if (s->src0_type != src0_type || s->src1_type != src1_type) { @@ -574,13 +547,17 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, if (s->N == N && s->K == K) { return s; } - } else if (s->N > 0 && s->K == 0) { - if (s->N == N) { - return s; - } - } else if (s->N == 0 && s->K > 0) { - if (s->K == K) { - return s; + } + + if (GGML_MULMAT_N_SHAPES == 6) { + if (s->N > 0 && s->K == 0) { + if (s->N == N) { + return s; + } + } else if (s->N == 0 && s->K > 0) { + if (s->K == K) { + return s; + } } } } @@ -639,7 +616,7 @@ void ggml_mulmat_tune_estimate_time( for (int i_stage = 0; i_stage < 3; i_stage++) { const struct ggml_task_stage *stage = &profile->stages[i_stage]; - if (stage->backend == GGML_TASK_BACKEND_NONE) { + if (!stage->valid) { continue; } @@ -784,23 +761,6 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { return sz; } -int ggml_mulmat_tune_get_builtin_task_backends( - enum ggml_task_backend *backends) { - int i = 0; - backends[i++] = GGML_TASK_BACKEND_CPU; - - if (ggml_cpu_has_cpublas()) { - backends[i++] = GGML_TASK_BACKEND_CPU_BLAS; - } - - if (ggml_cpu_has_cublas()) { - backends[i++] = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - backends[i++] = GGML_TASK_BACKEND_GPU_CL; - } - return i; -} - bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params) { GGML_ASSERT(tune); @@ -809,23 +769,6 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, memset(tune, 0, sizeof(struct ggml_mulmat_tune)); - enum ggml_task_backend backends[16]; - int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); - if (n_backends < 2) { - fprintf(stderr, - "[tune] error: this program was not built with BLAS.\n"); - return false; - } - - if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K && - params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) { -#if defined(GGML_USE_CLBLAST) - printf("[tune] error: cl implementation does not support k_quants at " - "the time of writing this code, skip.\n"); - return false; -#endif - } - bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); 
if (!ok) { return false; @@ -835,12 +778,13 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, char buf[128] = {0}; int offset = 0; - for (int i = 0; i < n_backends; i++) { + for (int i = 0; i < tune->shapes[0].n_profiles; i++) { if (i > 0) { buf[offset++] = ','; buf[offset++] = ' '; } - const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); + const char *name = tune->shapes[0].profiles[i].name; + GGML_ASSERT(name != NULL && strcmp(name, "") != 0); size_t len = strlen(name); memcpy(&buf[offset], name, len); offset += (int)len; @@ -848,17 +792,17 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, fprintf(stdout, "[tune] model: %s, ggml ftype: %d, " - "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", + "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n", params->model.name, params->model.ftype, params->n_pass, - params->n_threads, tune->n_shapes, buf); + tune->n_shapes, params->n_threads, buf); } int64_t stages_time[3]; int64_t t0 = ggml_time_ms(); - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - tune->n_threads, NULL, ggml_compute_forward_wrapper, - GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); + struct ggml_threading_context *thrd_ctx = + ggml_threading_start(tune->n_threads, NULL, NULL, + GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; @@ -896,6 +840,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, for (int ip = 0; ip < shape->n_profiles; ip++) { const struct ggml_task_profile *profile = &shape->profiles[ip]; + // GGML_ASSERT(profile->runner); memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); @@ -911,9 +856,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, stages_time[j] = 0; } - enum ggml_compute_error err = ggml_threading_compute_tensor( - thrd_ctx, node, wdata, wsize); - GGML_ASSERT(err == GGML_COMPUTE_OK); + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + + if (memcmp(profile, &node->task_profile, + sizeof(struct ggml_task_profile)) != 0) { + printf("[tune] error: task profile changed, tensor op: " + "%d, original id: %d, current id: %d\n", + node->op, profile->id, node->task_profile.id); + exit(1); + } for (int i = 0; i < 3; i++) { int v = (int)stages_time[i]; diff --git a/ggml-tune.h b/ggml-tune.h index 7955a50a9..addcd34db 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,7 +10,7 @@ extern "C" { #endif -#define GGML_MULMAT_TUNE_VERSION 9 +#define GGML_MULMAT_TUNE_VERSION 10 #define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_CACHE_LEN 16 @@ -119,7 +119,7 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); -bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); +int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); const struct ggml_mulmat_tune_shape * ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, @@ -129,11 +129,6 @@ void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape, int M, struct ggml_mulmat_tune_time *profile_time); -const char *ggml_task_backend_name(enum ggml_task_backend backend); - -int ggml_mulmat_tune_get_builtin_task_backends( - enum ggml_task_backend *backends); - bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params); diff --git a/ggml.c b/ggml.c index 43ec93a64..62750b20b 
100644 --- a/ggml.c +++ b/ggml.c @@ -8500,15 +8500,6 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; -#ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; - } -#endif - const int64_t nr = ggml_nrows(src0); const int64_t ne00 = src0->ne[0]; @@ -9933,6 +9924,168 @@ static void ggml_compute_forward_rms_norm_back( } } +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +static void ggml_compute_forward_mul_mat_blas( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + struct ggml_tensor * src0 = dst->src0; + struct ggml_tensor * src1 = dst->src1; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + // const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + const int ith = params->ith; + const int nth = params->nth; + + if (src0->type == GGML_TYPE_F32) { + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else if (src0->type == GGML_TYPE_F16) { + // TODO: we don't support permuted src0 + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + float * const wdata = params->wdata; + { + size_t id = 0; + for (int64_t i01 = 0; i01 < ne01; ++i01) { + for (int64_t i00 = 0; i00 < ne00; ++i00) { + wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); + } + } + + assert(id*sizeof(float) <= params->wsize); + } + + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + cblas_sgemm(CblasRowMajor, CblasNoTrans, 
CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else if (ggml_is_quantized(src0->type)) { + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[src0->type]); + GGML_ASSERT(src0->data); + GGML_ASSERT(params->wdata); + + float * const wdata = params->wdata; + dequantize_row_q_t const dequantize_row_q = quantize_fns[src0->type].dequantize_row_q; + + if (params->type == GGML_TASK_INIT) { + // rows per thread + const int dr = (ne01 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + int ir1 = MIN(ir0 + dr, ne01); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; + float * wdata_offset = wdata + i03*ne03 + i02*ne02; + for (int64_t i = ir0; i < ir1; ++i) { + dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); + } + } + } + return; + } + + GGML_ASSERT(nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + // zT = y * xT + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else { + GGML_ASSERT(false); + } +} +#endif + // CPU only static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, @@ -9947,9 +10100,6 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - const int64_t ne10 = src1->ne[0]; -#endif const int64_t ne11 = src1->ne[1]; #ifndef NDEBUG const int64_t ne12 = src1->ne[2]; @@ -10004,37 +10154,7 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; -#else - GGML_ASSERT(false); -#endif - } - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); // parallelize by src0 rows using ggml_vec_dot_f32 @@ -10152,57 +10272,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - GGML_ASSERT(comp_backend & 
GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT(nb10 == sizeof(float)); - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - float * const wdata = params->wdata; - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } - } - - assert(id*sizeof(float) <= params->wsize); - } - - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ - - return; -#else - GGML_ASSERT(false); -#endif - } - - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); - if (params->type == GGML_TASK_INIT) { ggml_fp16_t * const wdata = params->wdata; @@ -10348,68 +10417,6 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT (init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(src0->data); - GGML_ASSERT(params->wdata); - - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - - if (params->type == GGML_TASK_INIT) { - // rows per thread - const int dr = (ne01 + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - int ir1 = MIN(ir0 + dr, ne01); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; - float * wdata_offset = wdata + i03*ne03 + i02*ne02; - for (int64_t i = ir0; i < ir1; ++i) { - dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); - } - } - } - return; - } - - GGML_ASSERT(nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - return; -#else - GGML_ASSERT(false); -#endif - } - - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); - if (params->type 
== GGML_TASK_INIT) { GGML_ASSERT(params->nth == 1); @@ -14257,6 +14264,7 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// +// CPU only: no BLAS. static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); @@ -15459,96 +15467,163 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- -// TODO: replace with ggml_compute_forward_cuda +// Check the type and memeory layout for mul_mat on blas(CPU BLAS) +static bool ggml_mul_mat_check_type_mem(struct ggml_tensor *tensor) { + enum ggml_type src0_t = tensor->src0->type; + enum ggml_type src1_t = tensor->src1->type; + + // This is the minimal requirement to run mulmat with BLAS. + // Don't check matrix size because that would break tuning. + return (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && tensor->type == GGML_TYPE_F32 && + ggml_is_contiguous(tensor->src0) && ggml_is_contiguous(tensor->src1); +} + // DO NOT check matrix size further. #if defined(GGML_USE_CUBLAS) -static enum ggml_compute_error ggml_compute_forward_cuda( - const struct ggml_compute_params * params, - struct ggml_tensor * tensor) { - GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)); +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_cuda(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + if (tensor->op == GGML_OP_MUL_MAT) { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + } + if (ggml_cuda_compute_forward(params, tensor)) { return GGML_COMPUTE_OK; } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || + tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; } -#endif +#endif // GGML_USE_CUBLAS -// TODO: replace with ggml_cl_mul_mat. -// DO NOT check matrix size further. #if defined(GGML_USE_CLBLAST) -static enum ggml_compute_error ggml_compute_forward_cl( - const struct ggml_compute_params * params, - struct ggml_tensor * tensor) { +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_cl(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { switch (tensor->op) { - case GGML_OP_MUL_MAT: - GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor)); - ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); - return GGML_COMPUTE_OK; - default: - break; + case GGML_OP_MUL: { + if (tensor->src1 && ggml_cl_is_gpu_offloading(tensor)) { + if (params->ith == 0) { + ggml_cl_mul(tensor->src0, tensor->src1, tensor); + return GGML_COMPUTE_OK; + } + } + } break; + case GGML_OP_MUL_MAT: { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, + params->wsize); + return GGML_COMPUTE_OK; + } break; + default: { + } break; } - GGML_ASSERT(false); + return GGML_COMPUTE_FALLBACK; } -static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) { - switch (tensor->op) { - case GGML_OP_MUL_MAT: - return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); - default: - break; - } - return -1; +// Implements ggml_task_wsize_getter. 
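The CUDA and CL wrappers above both follow the same contract: a task runner either handles the tensor and returns `GGML_COMPUTE_OK`, or returns `GGML_COMPUTE_FALLBACK` so that the threading layer (see the `ggml-threading.c` hunk earlier in this diff) can reset the node to the CPU profile and retry. A self-contained toy sketch of that flow, with stand-in names rather than the real ggml types:

```c
// Toy sketch of the OK / FALLBACK runner contract. Names here are stand-ins;
// in ggml the corresponding types are ggml_compute_error and ggml_task_runner.
#include <stdio.h>

enum compute_error { COMPUTE_OK, COMPUTE_FALLBACK };

typedef enum compute_error (runner_fn)(int op);

static enum compute_error backend_runner(int op) {
    // Pretend this backend only implements op 0 ("mul_mat").
    return op == 0 ? COMPUTE_OK : COMPUTE_FALLBACK;
}

static enum compute_error cpu_runner(int op) {
    (void) op;
    return COMPUTE_OK; // the CPU profile must always succeed
}

static void compute(runner_fn *selected, int op) {
    enum compute_error err = selected(op);
    const char *who = "backend";
    if (err == COMPUTE_FALLBACK) {
        // Same retry the main thread performs via `goto START` with profiles[0].
        err = cpu_runner(op);
        who = "cpu (fallback)";
    }
    printf("op %d handled by %s: %s\n", op, who,
           err == COMPUTE_OK ? "ok" : "error");
}

int main(void) {
    compute(backend_runner, 0); // handled by the backend
    compute(backend_runner, 1); // falls back to the CPU profile
    return 0;
}
```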
+static int ggml_compute_forward_cl_get_wsize(struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); + default: + break; + } + return -1; } -#endif +#endif // GGML_USE_CLBLAST -// The wrapper for external mulmat tune tool. -enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params, - struct ggml_tensor *tensor) { - // We call ggml_compute_forward because the CUDA mul_mat entry point - // was moved out of `ggml_compute_forward_mul_mat`. - return ggml_compute_forward(params, tensor); +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_blas(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + ggml_compute_forward_mul_mat_blas(params, tensor); + return GGML_COMPUTE_OK; + } break; + default: { + } break; + } + + return GGML_COMPUTE_FALLBACK; } +// Implements ggml_task_wsize_getter. +static int ggml_compute_forward_blas_get_wsize(struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: { + GGML_ASSERT(tensor->src1->type == GGML_TYPE_F32); + enum ggml_type src0_t = tensor->src0->type; + + if (src0_t == GGML_TYPE_F16) { + return GGML_TYPE_SIZE[GGML_TYPE_F32] * + (tensor->src0->ne[0] * tensor->src0->ne[1]); + } else if (src0_t == GGML_TYPE_F32) { + return 0; + } else if (ggml_is_quantized(src0_t)) { + return GGML_TYPE_SIZE[GGML_TYPE_F32] * + (tensor->src0->ne[0] * tensor->src0->ne[1]); + } else { + GGML_ASSERT(false); + } + } break; + default: + break; + } + return -1; +} +#endif // GGML_USE_ACCELERATE | GGML_USE_OPENBLAS + // Implement `ggml_task_profiles_provider`. -// Fill `profiles` for the `node` and return number of profiles. +// Fill `profiles` for the `tensor` and return number of profiles. // -// NOTE: the node may be incompleted from testing or tunning, so please assert +// NOTE: the tensor may be incompleted from testing or tunning, so please assert // everything used here. // -// TODO: configure cuda for none mul_mat nodes. +// First profile is always CPU, followed by BLAS, CUDA/CL. 
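All BLAS branches consolidated into `ggml_compute_forward_mul_mat_blas` earlier in this diff end in the same row-major sgemm call, computing `zT = y * xT` (that is, `dst = src1 · src0ᵀ`) one 2-D plane at a time. A standalone demo of just that call pattern, with small illustrative shapes (assumes a CBLAS provider such as OpenBLAS is linked; on macOS include Accelerate instead):

```c
// Build e.g.: cc sgemm_demo.c -lopenblas
#include <stdio.h>
#include <cblas.h>

int main(void) {
    enum { ne00 = 3, ne01 = 2, ne10 = 3, ne11 = 2 };

    const float x[ne01 * ne00] = { 1, 2, 3,
                                   4, 5, 6 };   // src0: ne01 x ne00 (2 x 3)
    const float y[ne11 * ne10] = { 1, 0, 1,
                                   0, 1, 0 };   // src1: ne11 x ne10 (2 x 3)
    float d[ne11 * ne01] = { 0 };               // dst:  ne11 x ne01 (2 x 2)

    // zT = y * xT -- same argument order as in ggml_compute_forward_mul_mat_blas.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne11, ne01, ne10,
                1.0f, y, ne10,
                      x, ne00,
                0.0f, d, ne01);

    for (int i = 0; i < ne11; i++) {
        printf("%6.1f %6.1f\n", d[i*ne01 + 0], d[i*ne01 + 1]);
    }
    // Expected output:
    //    4.0   10.0   (row 0 of y dotted with the rows of x)
    //    2.0    5.0
    return 0;
}
```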
int ggml_get_task_profiles( - struct ggml_tensor *node, + struct ggml_tensor *tensor, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { - GGML_ASSERT(node); - GGML_ASSERT(node->op >= 0); + + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->op >= 0); GGML_ASSERT(profiles); memset(profiles, 0, sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES); struct ggml_task_profile *p = profiles; - int n_profiles = 0; - switch (node->op) { + int n_profiles = 1; + strcpy(p[0].name, "CPU"); + p[0].runner = ggml_compute_forward; + // p[0].wsize_getter = ...; + + switch (tensor->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_ADD: case GGML_OP_ADD1: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_ACC: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_SUB: case GGML_OP_DIV: @@ -15565,13 +15640,11 @@ int ggml_get_task_profiles( case GGML_OP_NEG: case GGML_OP_STEP: case GGML_OP_RELU: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_MUL: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_GELU: case GGML_OP_SILU: @@ -15579,69 +15652,32 @@ int ggml_get_task_profiles( case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: { - // CPU only profiles. - // CUDA/CL: see end of function. - GGML_ASSERT(node->src0); - GGML_ASSERT(node->src1); - - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; - - GGML_ASSERT(src1_t == GGML_TYPE_F32); - - int i = 0; + case GGML_OP_OUT_PROD: { // FIXME: is this correct? 
+ enum ggml_type src0_t = tensor->src0->type; if (src0_t == GGML_TYPE_F32) { - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } else if (src0_t == GGML_TYPE_F16) { - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } else if (ggml_is_quantized(src0_t)) { - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[0].parallel = true; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } - n_profiles = i; } break; case GGML_OP_SCALE: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_SET: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: @@ -15652,64 +15688,53 @@ int ggml_get_task_profiles( case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_ALIBI: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_CLAMP: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_FLASH_ATTN: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_FLASH_FF: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } case GGML_OP_FLASH_ATTN_BACK: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + 
p[0].stages[1].valid = true; } break; case GGML_OP_CROSS_ENTROPY_LOSS: - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - p[0].stages[2].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[2].valid = true; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_NONE: case GGML_OP_COUNT: { @@ -15719,227 +15744,196 @@ int ggml_get_task_profiles( GGML_ASSERT(false); } -#if defined(GGML_USE_CUBLAS) - switch (node->op) { - case GGML_OP_ADD: - case GGML_OP_MUL: - case GGML_OP_SILU: - case GGML_OP_RMS_NORM: - case GGML_OP_MUL_MAT: - case GGML_OP_RESHAPE: - case GGML_OP_ROPE: { - int i = n_profiles; - p[i].runner = ggml_compute_forward_cuda; - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (tensor->op == GGML_OP_MUL_MAT) { + enum ggml_type src0_t = tensor->src0->type; + int i = n_profiles; + + strcpy(p[i].name, "BLAS"); + p[i].runner = ggml_compute_forward_blas; + p[i].wsize_getter = ggml_compute_forward_blas_get_wsize; + + if (src0_t == GGML_TYPE_F32) { + p[i].stages[1].valid = true; p[i].stages[1].wait = true; - ++n_profiles; - } break; - default: { - } break; + } else if (src0_t == GGML_TYPE_F16) { + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + } else if (ggml_is_quantized(src0_t)) { + p[i].stages[0].valid = true; + p[i].stages[0].parallel = true; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + } + ++n_profiles; + } +#endif + +#if defined(GGML_USE_CUBLAS) + if (true) { // FIXME: filter supported op to avoid unnecceary fallback. + int i = n_profiles; + strcpy(p[i].name, "CUDA"); + p[i].runner = ggml_compute_forward_cuda; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + ++n_profiles; } #elif defined(GGML_USE_CLBLAST) - switch (node->op) { - case GGML_OP_MUL_MAT: { - int i = n_profiles; - p[i].runner = ggml_compute_forward_cl; - p[i].get_wsize = ggml_compute_forward_get_wsize_cl; - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - ++n_profiles; - } break; - default: { - } break; + if (tensor->op == GGML_OP_MUL || tensor->op == GGML_OP_MUL_MAT) { + int i = n_profiles; + strcpy(p[i].name, "CL"); + p[i].runner = ggml_compute_forward_cl; + p[i].wsize_getter = ggml_compute_forward_cl_get_wsize; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + ++n_profiles; } #endif GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); - for (int i = 0; i < n_profiles; i++) { - profiles[i].id = i + 1; + + for (int j = 0; j < n_profiles; j++) { + profiles[j].id = j + 1; } + return n_profiles; } -// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. -static const struct ggml_task_profile *ggml_mulmat_get_task_profile( - struct ggml_tensor *node, struct ggml_task_profile *profiles, - int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) { +// Try to fix task profile for given tensor, because the task profile might not +// be the most performant. 
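Before the implementation that follows below, here is a deliberately simplified, standalone sketch of the policy it applies: walk the profiles from the highest id down and keep a non-CPU profile only when the matrices look large enough (M, N, K >= 32) or the data is already offloaded to the GPU, otherwise settle on profile 0 (CPU). Every name in the sketch is a stand-in; the real function additionally consults the tune table and applies per-backend type/layout checks:

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct mini_profile { int id; char name[16]; };

// Hypothetical helper mirroring the size heuristic used in the patch.
static bool size_match(int M, int N, int K) {
    return M >= 32 && N >= 32 && K >= 32;
}

static const struct mini_profile *
pick_profile(const struct mini_profile *p, int n, int M, int N, int K,
             bool gpu_offloading) {
    for (int i = n - 1; i >= 0; --i) {
        if (strcmp(p[i].name, "CPU") == 0) {
            continue;
        }
        if (size_match(M, N, K) || gpu_offloading) {
            return &p[i];        // highest-id non-CPU profile that qualifies
        }
    }
    return &p[0];                // default: CPU
}

int main(void) {
    const struct mini_profile profs[] = { {1, "CPU"}, {2, "BLAS"}, {3, "CL"} };

    // Decode-sized matmul (M = 1): stays on CPU.
    printf("%s\n", pick_profile(profs, 3, 1,   4096, 4096, false)->name); // CPU
    // Prompt-sized matmul: picks the last non-CPU profile.
    printf("%s\n", pick_profile(profs, 3, 512, 4096, 4096, false)->name); // CL
    return 0;
}
```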
+static void ggml_optimize_tensor_task_profile( + struct ggml_tensor *tensor, struct ggml_task_profile *profiles, + int n_profiles, struct ggml_mulmat_tune *tune) { + + if (tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_OUT_PROD) { + return; + } + + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->op == GGML_OP_MUL_MAT || + tensor->op == GGML_OP_OUT_PROD); + GGML_ASSERT(tensor->task_profile.id == n_profiles); - GGML_ASSERT(node); - GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(profiles); - GGML_ASSERT(n_profiles > 0); + GGML_ASSERT(n_profiles > 1); - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; + int M = (int)tensor->ne[1]; + int N = (int)tensor->ne[0]; + int K = (int)tensor->src1->ne[0]; - // Type and memory layout requirements for computing mul_mat with BLAS. - bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || - ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); - - int M = (int)node->ne[1]; - int N = (int)node->ne[0]; - int K = (int)node->src1->ne[0]; - - const struct ggml_task_profile *prof = NULL; - - if (cond_match) { #if defined(GGML_USE_TUNE) - if (tune != NULL) { - GGML_ASSERT(n_profiles >= 2); - int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, - src1_t, stages_time_us); + if (tune != NULL && ggml_mul_mat_check_type_mem(tensor)) { + GGML_ASSERT(tensor->backend == 0 && tensor->src0->backend == 0 && + tensor->src1->backend == 0); + + GGML_ASSERT(n_profiles >= 2); + + enum ggml_type src0_t = tensor->src0->type; + enum ggml_type src1_t = tensor->src1->type; + + int stages_time_us[3]; + + int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + src1_t, stages_time_us); + if (id > 0) { + struct ggml_task_profile *prof = NULL; + for (int i = 0; i < n_profiles; i++) { if (profiles[i].id == id) { prof = &profiles[i]; - return prof; - } - } - } -#else - UNUSED(tune); - UNUSED(stages_time_us); -#endif - - if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { - for (int j = 0; j < n_profiles; j++) { - enum ggml_task_backend comp_be = - profiles[j].stages[GGML_TASK_COMPUTE].backend; - switch (comp_be) { - case GGML_TASK_BACKEND_GPU_CUDA: { - GGML_ASSERT(ggml_cpu_has_cublas()); - prof = &profiles[j]; - break; - } - case GGML_TASK_BACKEND_GPU_CL: { - GGML_ASSERT(ggml_cpu_has_clblast()); - prof = &profiles[j]; - break; - } - case GGML_TASK_BACKEND_CPU_BLAS: { - GGML_ASSERT(ggml_cpu_has_cpublas()); - prof = &profiles[j]; - break; - } - default: { - break; - } - } - - if (prof) { break; } } - } - } - if (prof == NULL) { - prof = &profiles[0]; - GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU); - } - - return prof; -} - -void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node, - struct ggml_cgraph *cgraph) { - // Pre-specified. - for (int i = 0; i < 3; i++) { - if (node->task_profile.stages[i].backend > 0) { - return; - } - } - - struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; - int n_profiles = ggml_get_task_profiles(node, profiles); - - const struct ggml_task_profile *profile = NULL; - - // GPU offloading. A special case of pre-specified task_profile. 
- if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) { - if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) { - enum ggml_task_backend be; - if (ggml_cpu_has_cublas()) { - be = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - be = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); - } - - for (int j = 0; j < n_profiles; j++) { - if (profiles[j].stages[1].backend == be) { - profile = &profiles[j]; - break; - } - } - GGML_ASSERT(profile); - GGML_ASSERT(!cgraph->tune); - - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; - } - } - - // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD. - if (node->op == GGML_OP_MUL_MAT) { -#if defined(GGML_USE_TUNE) - GGML_ASSERT(node->backend == GGML_BACKEND_CPU); - - int stages_time_us[3]; - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, - cgraph->tune, stages_time_us); - GGML_ASSERT(profile); - - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - - if (cgraph->tune) { - memcpy(&node->task_profile, profile, + if (prof) { + memcpy(&tensor->task_profile, prof, sizeof(struct ggml_task_profile)); - // Do not wait if the estimated execution time is too small - // (e.g. less than 0.1 ms) - // TODO: need bench actual wait/notify time, see - // ggml-threading.c - for (int j = 0; j < 3; j++) { - if (node->task_profile.stages[j].wait) { - if (stages_time_us[j] < 100) { - node->task_profile.stages[j].wait = false; + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j < 3; j++) { + if (tensor->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + tensor->task_profile.stages[j].wait = false; + } } } + return; } } - return; + } #else - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, - NULL); - GGML_ASSERT(profile); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; + UNUSED(tune); #endif - } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct? - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, - NULL); - GGML_ASSERT(profile); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; + + // Guess the optimal matrix size. + bool size_match = (M >= 32 && N >= 32 && K >= 32); + UNUSED(size_match); + + for (int i = n_profiles - 1; i >= 0; --i) { + const char *name = profiles[i].name; + + if (strcmp(name, "CUDA") == 0) { +#if defined(GGML_USE_CUBLAS) + if ((size_match || ggml_cuda_is_gpu_offloading(tensor)) && + ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[i], + sizeof(struct ggml_task_profile)); + return; + } +#endif + } + + if (strcmp(name, "CL") == 0) { +#if defined(GGML_USE_CLBLAST) + if ((size_match || ggml_cl_is_gpu_offloading(tensor)) && + ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[i], + sizeof(struct ggml_task_profile)); + return; + } +#endif + } + + if (strcmp(name, "BLAS") == 0) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (size_match && ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); + return; + } +#endif + } } - // default. 
- profile = &profiles[0]; - GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + memcpy(&tensor->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); +} + +static void ggml_set_tensor_task_profile(struct ggml_tensor *tensor, + struct ggml_mulmat_tune *tune) { + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(tensor, profiles); + GGML_ASSERT(n_profiles > 0); + + // By default use profile with the largest id. + // Profile id starts from 1. + memcpy(&tensor->task_profile, &profiles[n_profiles - 1], + sizeof(struct ggml_task_profile)); + + if (n_profiles > 1) { + GGML_ASSERT(tensor->task_profile.id > 1); + ggml_optimize_tensor_task_profile(tensor, profiles, n_profiles, tune); + } + + GGML_ASSERT(tensor->task_profile.id > 0); } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, NULL, ggml_compute_forward, - GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); - // initialize tasks + work buffer { // int64_t t0 = ggml_time_us(); @@ -15952,25 +15946,26 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_ASSERT (node->op != GGML_OP_NONE); - struct ggml_task_stage *stages = node->task_profile.stages; + if (node->task_profile.id == 0) { + ggml_set_tensor_task_profile(node, cgraph->tune); + } - ggml_graph_compute_set_tensor_task_proile(node, cgraph); + struct ggml_task_stage *stages = node->task_profile.stages; // // Allocate temp buffer `wdata` for CPU. // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. // - if (node->task_profile.get_wsize) { - int sz = node->task_profile.get_wsize(node); + if (node->task_profile.wsize_getter) { + int sz = node->task_profile.wsize_getter(node); if (sz >= 0) { work_size = MAX(work_size, (size_t)sz); + // FIXME: is it safe to continue in case fallback? continue; } } - //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend); - // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; @@ -16034,35 +16029,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_OUT_PROD: // FIXME: is this correct? 
{ size_t cur = 0; - enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { - GGML_ASSERT(ggml_cpu_has_cpublas()); - GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); - if (node->src0->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_F16) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - GGML_ASSERT(false); - } - } else { // CPU or GPU fallback - GGML_ASSERT(node->src1->type == GGML_TYPE_F32); - - if (node->src0->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_F16) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } else if (ggml_is_quantized(node->src0->type)) { - const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; - cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; - } else { - GGML_ASSERT(false); - } + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + } else if (ggml_is_quantized(node->src0->type)) { + const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; + cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + } else { + GGML_ASSERT(false); } work_size = MAX(work_size, cur); @@ -16218,6 +16195,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); + struct ggml_threading_context *thrd_ctx = ggml_threading_start(n_threads, + NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + for (int i = 0; i < cgraph->n_nodes; i++) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); diff --git a/ggml.h b/ggml.h index d4d5d3521..554645ba8 100644 --- a/ggml.h +++ b/ggml.h @@ -362,29 +362,10 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // As part of task config profile solution, `ggml_task_backend` defines - // backends for each task stage. Similar to `ggml_tensor.backend`, - // `ggml_tensor.task_profile` generalizes how to configure tensor computing - // at per task-stage level. - // - // The following enum values are designed as combination of hardware and - // optional software interface. - enum ggml_task_backend { - GGML_TASK_BACKEND_NONE = 0, - - // [0x10, 0x1F]: CPU - GGML_TASK_BACKEND_CPU = 0x10, - GGML_TASK_BACKEND_CPU_BLAS = 0x11, - - // [0x20 - 0x2F]: GPU - GGML_TASK_BACKEND_GPU = 0x20, - GGML_TASK_BACKEND_GPU_CUDA = 0x21, - GGML_TASK_BACKEND_GPU_CL = 0x22, - }; - // config for computing one of the 3 task stages of a tensor. struct ggml_task_stage { - enum ggml_task_backend backend; + bool valid; + bool parallel; // hint idle workers go waiting, valid only when parallel is false. bool wait; @@ -407,13 +388,16 @@ extern "C" { // Get wsize for node computing. // When return -1: should be explained as `fallback to CPU`, caller MUST // determine how much memory to reserve for this node. 
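The comment just above states the contract that `ggml_task_wsize_getter` implementations must honour: a result >= 0 is the exact wdata size for the selected profile, while -1 means the getter cannot tell and the caller has to budget its own (CPU-path) estimate, which is what `ggml_graph_compute` does in the hunk above. A minimal caller-side sketch of that reading of the contract, with stand-in types only:

```c
#include <stddef.h>
#include <stdio.h>

// Stand-in for ggml_task_wsize_getter; the real one takes a ggml_tensor *.
typedef int (wsize_getter_fn)(const void *node);

static size_t node_wsize(wsize_getter_fn *getter, const void *node,
                         size_t cpu_estimate) {
    if (getter) {
        int sz = getter(node);
        if (sz >= 0) {
            return (size_t) sz;   // profile knows exactly what it needs
        }
    }
    return cpu_estimate;          // -1 or no getter: reserve the CPU estimate
}

static int getter_unknown(const void *node) { (void) node; return -1; }
static int getter_64k(const void *node)     { (void) node; return 64 * 1024; }

int main(void) {
    printf("%zu\n", node_wsize(getter_64k,     NULL, (size_t) 1 << 20)); // 65536
    printf("%zu\n", node_wsize(getter_unknown, NULL, (size_t) 1 << 20)); // 1048576
    return 0;
}
```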
-    typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor);
+    typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor);

     // config for computing a tensor.
     struct ggml_task_profile {
         // profile id, start from 1.
         int id;

+        // Required: not empty, no whitespace.
+        char name[16];
+
         // index 0: INIT, 1: COMPUTE, 2: FINALIZE
         struct ggml_task_stage stages[3];
@@ -421,7 +405,7 @@ extern "C" {
         ggml_task_runner *runner;

         // Optional function to return required wsize for wdata.
-        ggml_task_get_wsize *get_wsize;
+        ggml_task_wsize_getter *wsize_getter;

         // Optional flag for development.
         // MUST be used only in testing codes.
diff --git a/llama.cpp b/llama.cpp
index 06555e1dd..e6bddffd5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2744,8 +2744,9 @@ struct llama_context * llama_init_from_file(
 }

 #ifdef GGML_USE_TUNE
-bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
-    GGML_ASSERT (ctx->model.n_gpu_layers == 0);
+bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune,
+                       const char *fname) {
+    GGML_ASSERT(ctx->model.n_gpu_layers == 0);

     printf("\n");
@@ -2755,7 +2756,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
     enum ggml_ftype ggml_ftype;
     switch (hparams->ftype) {
-    case LLAMA_FTYPE_ALL_F32:
+    case LLAMA_FTYPE_ALL_F32:
         ggml_ftype = GGML_FTYPE_ALL_F32;
         break;
     case LLAMA_FTYPE_MOSTLY_F16:
@@ -2767,9 +2768,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
     case LLAMA_FTYPE_MOSTLY_Q4_1:
         ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
         break;
-    case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-        ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
-        break;
     case LLAMA_FTYPE_MOSTLY_Q5_0:
         ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
         break;
@@ -2799,8 +2797,8 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
         ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
         break;
     default:
-        throw std::runtime_error(
-            format("invalid output file type %d\n", hparams->ftype));
+        fprintf(stderr, "[tune] unsupported file type %d\n", hparams->ftype);
+        return false;
     }

     int n_vocab = hparams->n_vocab;
@@ -2808,30 +2806,36 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
     int n_rot = hparams->n_rot;
     int n_mult = hparams->n_mult;

-    int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
+    int n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;

     struct ggml_mulmat_tune_params params = {
-        /*.model =*/ {
-            /* .name =*/ model_name,
-            /* .ftype =*/ ggml_ftype,
-            /* .n_vocab =*/ n_vocab,
-            /* .n_embd =*/ n_embd,
-            /* .n_ff =*/ n_ff,
-            /* .n_rot =*/ n_rot,
+        /*.model =*/{
+            /* .name =*/model_name,
+            /* .ftype =*/ggml_ftype,
+            /* .n_vocab =*/n_vocab,
+            /* .n_embd =*/n_embd,
+            /* .n_ff =*/n_ff,
+            /* .n_rot =*/n_rot,
         },
-        /* .m_num =*/ 8,
-        /* .n_pass =*/ 1,
-        /* .n_threads =*/ n_threads,
-        /* .prrogress =*/ true,
-        /* .output_console =*/ false,
-        /* .fname =*/ fname,
+        /* .m_num =*/8,
+        /* .n_pass =*/1,
+        /* .n_threads =*/n_threads,
+        /* .prrogress =*/true,
+        /* .output_console =*/false,
+        /* .fname =*/fname,
     };

     bool empty_fname = !fname || strcmp(fname, "") == 0;

-    ctx->tune = new(struct ggml_mulmat_tune);
+    ctx->tune = new (struct ggml_mulmat_tune);
     if (!ctx->tune) {
-        throw std::runtime_error(format("failed to allocate memory for tune\n"));
+        fprintf(stderr, "[tune] failed to allocate memory for tune\n");
+        return false;
+    }
+
+    if (!ggml_cpu_has_blas()) {
+        fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n");
+        return false;
     }

     if (tune) {
@@ -2844,31 +2848,30 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
             ggml_mulmat_tune_free(ctx->tune);
             return true;
         }
-    } else {
-        if (empty_fname) {
-            return false;
-        }
+    } else if (empty_fname) {
+        return false;
     }

     if (!empty_fname) {
         FILE *fp = fopen(fname, "r");
         if (!fp) {
-            fprintf(stderr, "[tune] failed to open file %s.\n",
-                    fname);
+            fprintf(stderr, "[tune] failed to open file %s.\n", fname);
+            return false;
         } else {
-            bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
+            int rc = ggml_mulmat_tune_read_data(ctx->tune, fp);
             fclose(fp);

-            if (!ok) {
+            if (rc != 0) {
                 fprintf(stderr,
-                        "[tune] failed to read data from %s\n",
-                        fname);
+                        "[tune] failed to read data from %s, error code: %d\n",
+                        fname, rc);
                 return false;
             }

             fprintf(stderr, "[tune] loaded data from %s\n", fname);

-            ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads);
+            bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype,
+                                                params.n_threads);
             if (!ok) {
                 return false;
             }
diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c
index 2079fe144..e904fc10d 100644
--- a/tests/test-ggml-threading.c
+++ b/tests/test-ggml-threading.c
@@ -41,9 +41,8 @@ static const int n_repeat = 10;
 // counter with array.
 static int work_done_arr[MAX_N_THREADS];

-static enum ggml_compute_error
-mock_task_runner(const struct ggml_compute_params *params,
-                 struct ggml_tensor *node) {
+static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params,
+                                                struct ggml_tensor *node) {
     int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
     if (node->task_profile.stages[params->type].parallel) {
         loops /= params->nth;
@@ -80,20 +79,15 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {

     int t0 = (int)ggml_time_us();

-    struct ggml_threading_context *ctx = ggml_threading_start(
-        n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL);
+    node->task_profile.runner = mock_task_runner;
+
+    struct ggml_threading_context *ctx =
+        ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL);

     int t1 = (int)ggml_time_us();

     for (int i = 0; i < n_repeat; i++) {
-        enum ggml_compute_error err = ggml_threading_compute_tensor(
-            ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
-        if (err != GGML_COMPUTE_OK) {
-            ggml_threading_stop(ctx);
-            printf("ggml_threading_compute_tensor failed with error: %d.\n",
-                   err);
-            return 1;
-        }
+        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
     }

     int t2 = (int)ggml_time_us();
@@ -107,7 +101,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
     int expect = 0;
     for (int i = 0; i < 3; i++) {
         const struct ggml_task_stage *ts = &stages[i];
-        if (ts->backend != GGML_TASK_BACKEND_NONE) {
+        if (ts->valid) {
             if (ts->parallel) {
                 expect += n_threads;
             } else {
@@ -144,14 +138,12 @@ static enum ggml_compute_error
 mock_task_runner_fallback(const struct ggml_compute_params *params,
                           struct ggml_tensor *node) {
     UNUSED(params);
-    if (node->backend == GGML_BACKEND_GPU) {
-        // ... finally failed to compute in GPU.
-        node->backend = GGML_BACKEND_CPU;
+    // failed to run ...
+    if (node->task_profile.id == 2) {
         return GGML_COMPUTE_FALLBACK;
-    } else {
-        return GGML_COMPUTE_OK;
     }
+    return GGML_COMPUTE_OK;
 }

 // By design, fallback should happen when attempt computing tensor in GPU,
@@ -164,6 +156,9 @@ int test_fallback(struct ggml_tensor *node) {
     enum ggml_compute_error err =
         ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
     if (err == GGML_COMPUTE_FALLBACK) {
+        // mock: set up a new profile ...
+        node->task_profile.id = 1;
+
         err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
                                             /*wsize*/ 0);
     }
@@ -214,12 +209,12 @@ int main(void) {

     struct ggml_tensor node;
     memset(&node, 0, sizeof(struct ggml_tensor));
+    node.task_profile.runner = mock_task_runner;

     struct ggml_task_stage *stages = node.task_profile.stages;

-    stages[0].backend = GGML_TASK_BACKEND_CPU;
-    stages[1].backend = GGML_TASK_BACKEND_CPU;
-    stages[2].backend = GGML_TASK_BACKEND_NONE;
+    stages[0].valid = true;
+    stages[1].valid = true;

     int n_passed = 0;
     int n_tests = 0;
@@ -277,7 +272,7 @@ int main(void) {

         struct ggml_threading_context *ctx =
             ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
-                                 mock_task_runner, 0, /*stages_time*/ NULL);
+                                 NULL, 0, /*stages_time*/ NULL);

         int t1 = (int)ggml_time_us();
@@ -416,8 +411,8 @@ int main(void) {
     node.src0 = &src0;
     node.src1 = &src1;

-    node.backend = GGML_BACKEND_GPU;
-    stages[1].backend = GGML_TASK_BACKEND_GPU;
+    node.task_profile.id = 2;
+    stages[1].valid = true;

     if (test_fallback(&node) == 0) {
         ++n_passed;
         printf("[test-ggml-threading] test fallback: ok\n\n");
diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c
index 4339881e5..97fd6cfbf 100644
--- a/tests/test-ggml-tune.c
+++ b/tests/test-ggml-tune.c
@@ -46,13 +46,9 @@ int main(void) {
 }

 static int bench(void) {
-    {
-        enum ggml_task_backend backends[16];
-        int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
-        if (n_backends < 2) {
-            printf("[test-ggml-tune] skipped because no BLAS\n");
-            return 0;
-        }
+    if (!ggml_cpu_has_blas()) {
+        printf("[test-ggml-tune] skipped because no BLAS\n");
+        return 0;
     }

     {
@@ -118,10 +114,13 @@ static int
 ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
                                      struct ggml_task_profile *profiles) {
     UNUSED(node);
-    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
-    profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
-    profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
-    profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
+    profiles[0].id = 1;
+    profiles[0].stages[0].valid = true;
+    profiles[0].stages[1].valid = true;
+
+    profiles[1].id = 2;
+    profiles[1].stages[0].valid = true;
+    profiles[1].stages[1].valid = true;

     return 2;
 }
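
The `ggml.h` changes above replace the per-stage backend enum with a small `valid`/`parallel`/`wait` bitmap plus a profile `id`, `name`, `runner`, and `wsize_getter`, and the corrected provider in `tests/test-ggml-tune.c` shows how such profiles are expected to be filled in. The sketch below is not part of the patch: it is a self-contained toy in which `task_stage`, `task_profile`, `MAX_TASK_PROFILES`, and `mock_profiles_provider` are local stand-ins (only the field names are copied from this diff), used to illustrate how a provider fills the profile array and how a caller picks the highest-id profile by default, as `ggml_set_tensor_task_profile` does in `ggml.c`.

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

// Local stand-ins that mirror the fields added to ggml.h by this patch.
// Only the field names come from the diff; sizes and helpers are assumed.
struct task_stage {
    bool valid;     // stage participates in computation
    bool parallel;  // run the stage with all worker threads
    bool wait;      // hint idle workers to wait (only when parallel is false)
};

struct task_profile {
    int  id;                      // starts from 1; 0 means "not configured yet"
    char name[16];                // required: not empty, no whitespace
    struct task_stage stages[3];  // 0: INIT, 1: COMPUTE, 2: FINALIZE
};

#define MAX_TASK_PROFILES 8

// Shaped like ggml_task_profiles_mock_qxx_provider() in tests/test-ggml-tune.c:
// profile 1 is plain CPU, profile 2 stands in for a CPU+BLAS configuration.
static int mock_profiles_provider(struct task_profile *profiles) {
    memset(profiles, 0, sizeof(struct task_profile) * MAX_TASK_PROFILES);

    profiles[0].id = 1;
    snprintf(profiles[0].name, sizeof(profiles[0].name), "CPU");
    profiles[0].stages[0].valid    = true;  // INIT
    profiles[0].stages[1].valid    = true;  // COMPUTE
    profiles[0].stages[1].parallel = true;

    profiles[1].id = 2;  // note: index 1, as in the corrected provider above
    snprintf(profiles[1].name, sizeof(profiles[1].name), "BLAS");
    profiles[1].stages[0].valid = true;
    profiles[1].stages[1].valid = true;
    profiles[1].stages[1].wait  = true;  // single-threaded compute, workers wait

    return 2;
}

int main(void) {
    struct task_profile profiles[MAX_TASK_PROFILES];
    int n_profiles = mock_profiles_provider(profiles);

    // Default selection as in ggml_set_tensor_task_profile(): take the
    // profile with the largest id, i.e. the last one the provider returned.
    const struct task_profile *chosen = &profiles[n_profiles - 1];
    printf("default profile: id=%d name=%s\n", chosen->id, chosen->name);
    return 0;
}
```

In the real patch a tuned setup would then call `ggml_optimize_tensor_task_profile()` with the loaded `ggml_mulmat_tune` data, so the default (largest-id) choice can be demoted to a cheaper profile when tuning shows it does not pay off for the current shape.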
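The reworked fallback test encodes the runtime contract that replaces the old backend checks: a runner may return `GGML_COMPUTE_FALLBACK`, and the caller reacts by switching the tensor to another (lower-id) profile and computing again. The loop below is a hedged sketch of that contract only; the enum values mirror the test, while `compute_with_profile` and the generalized retry loop are illustrative assumptions, not API from this patch (the test itself retries exactly once, from id 2 to id 1).

```c
#include <stdio.h>

// Mirrors the error codes used by tests/test-ggml-threading.c; the names are
// kept local so the sketch stays self-contained.
enum compute_error {
    COMPUTE_OK = 0,
    COMPUTE_FALLBACK,
};

// Hypothetical stand-in for a profile runner: pretend that profile id 2
// (e.g. an offloading/BLAS profile) cannot handle the node and asks for
// fallback, exactly like mock_task_runner_fallback() does in the test.
static enum compute_error compute_with_profile(int profile_id) {
    if (profile_id == 2) {
        return COMPUTE_FALLBACK;
    }
    printf("computed with profile id %d\n", profile_id);
    return COMPUTE_OK;
}

int main(void) {
    // Start from the most capable profile; on fallback, retry with the next
    // lower id, which is what test_fallback() does with ids 2 -> 1.
    int profile_id = 2;
    enum compute_error err = compute_with_profile(profile_id);

    while (err == COMPUTE_FALLBACK && profile_id > 1) {
        --profile_id;
        err = compute_with_profile(profile_id);
    }

    return err == COMPUTE_OK ? 0 : 1;
}
```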