mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-10 18:51:45 +00:00
bulk refactoring of task profile and related code to run CL GPU offloading.
* removed ggml_task_backend in favour of ggml_task_profile.runner and the newly added id and name.
* extracted the mul_mat BLAS code into ggml_compute_forward_mul_mat_blas, aligning with CUDA/CL a bit more and making it easier to fix profiles and run tune.
* rewrote the task profile and updated/added some CUDA/CL code, finally making CL GPU offloading work.
* misc minor fixes/updates to tune; the data format was changed.
This commit is contained in:
parent
6b83a3e16f
commit
06b00827a0
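For orientation before the diffs, here is a minimal sketch (not the verbatim header) of what the reshaped task configuration looks like after this commit, pieced together from the ggml.h hunk further down. The forward declarations and the plain `int` return types are placeholders for the real ggml types (the actual runner returns `enum ggml_compute_error`).

```c
#include <stdbool.h>

struct ggml_compute_params;   // opaque here; defined in ggml.h
struct ggml_tensor;

// Placeholder signatures; the real runner returns enum ggml_compute_error.
typedef int (ggml_task_runner)(const struct ggml_compute_params *params,
                               struct ggml_tensor *tensor);
typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor);

// Per-stage config is now a small bitmap instead of a ggml_task_backend value.
struct ggml_task_stage {
    bool valid;
    bool parallel;
    bool wait;   // hint idle workers to wait; meaningful only when parallel is false
};

// A profile is identified by id/name and carries its own runner, which is how
// CUDA/CL offloading is selected now that the backend enum is gone.
struct ggml_task_profile {
    int  id;                              // starts from 1
    char name[16];                        // required, no whitespace
    struct ggml_task_stage stages[3];     // 0: INIT, 1: COMPUTE, 2: FINALIZE
    ggml_task_runner       *runner;       // optional per-profile runner
    ggml_task_wsize_getter *wsize_getter; // optional wsize hook
};
```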
@@ -214,26 +214,19 @@ The following results are generated with Accelerate compiled.

**Example**

```
5 3B 2 6 1
[tune] done, elapsed time: 0 seconds.
10 xB 12 4 2

3200 3200 2 0 3 10
16 0 0 0 16 1 0 1 0 0 0 0
16 1 0 2 17 0 1 0 0 0 0 0
0 0 0 0 34 0 1 0 0 0 0 0
1 1 793 0 9103 2102 0 0 6014 0
2 2 1591 0 8034 2305 0 0 30982 0
4 4 2236 0 6476 2484 0 0 31388 0
8 7 4161 0 6623 2389 0 0 29204 0
16 15 8339 0 6434 2752 0 0 34303 0
32 32 16919 0 6915 3651 0 0 42511 0
64 200 34270 0 6574 4528 0 0 68212 0
128 188 69400 0 6325 6839 0 0 74437 0
256 303 134597 0 6168 11544 0 0 110180 0
512 687 279685 0 6337 29712 0 0 159728 0
1024 1024 12 0 2 4
100 110 000 1 CPU
110 101 000 2 BLAS
1 11 309 0 1234 90 0
2 23 654 0 1359 215 0
4 44 1283 0 1362 421 0
8 85 2341 0 1357 347 0

3200 8640 2 0 2 10

...
1024 2048 12 0 2 4
...

```
@@ -249,17 +242,17 @@ shape+

# head
version: 1
model: "3B" | "7B" | "13B" | "30B" | "65B"
ggml_ftype: 0 - 4, 7 - 14
ggml_ftype: 0 - 3, 7 - 14
n_shapes: number of shapes
n_threads: number of threads

shape := N K m_num n_profiles
task_conf_profile+
shape := N K src0_ggml_type src1_ggml_type n_profiles m_num
task_profile+
bench_item+

task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize)
stage_conf: backend parallel wait
backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL)
task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name
stage_conf(bitmap): valid parallel wait
valid: 0 (false) | 1 (true)
parallel: 0 (false) | 1 (true)
wait: 0 (false) | 1 (true)

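To make the new stage_conf bitmap concrete: each stage is serialized as three packed digits (valid, parallel, wait), so a profile line reads like `100 110 000 1 CPU`. Below is a small sketch of how such a bitmap could be written and read back with the same `%1d%1d%1d` format the tune code uses; `stage_conf` and the two helpers are hypothetical stand-ins, not the actual ggml-tune functions.

```c
#include <stdbool.h>
#include <stdio.h>

// Hypothetical mirror of one stage_conf bitmap: valid, parallel, wait.
struct stage_conf { bool valid, parallel, wait; };

// Write one stage as three packed digits, e.g. valid+parallel, no wait -> "110".
static void write_stage_conf(FILE *fp, const struct stage_conf *s) {
    fprintf(fp, "%1d%1d%1d", s->valid ? 1 : 0, s->parallel ? 1 : 0, s->wait ? 1 : 0);
}

// Read the three digits back; returns 0 on success, -1 on a scan failure.
static int read_stage_conf(FILE *fp, struct stage_conf *s) {
    int valid, parallel, wait;
    if (fscanf(fp, " %1d%1d%1d", &valid, &parallel, &wait) != 3) {
        return -1;
    }
    s->valid    = valid    != 0;
    s->parallel = parallel != 0;
    s->wait     = wait     != 0;
    return 0;
}
```

A full task_profile record is then three such bitmaps followed by the profile id and name, which is exactly what `ggml_mulmat_tune_write_profiles` emits in the ggml-tune.c hunks below.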
@@ -111,6 +111,11 @@ static void usage(char *prog) {
}

int main(int argc, char **argv) {
    if (!ggml_cpu_has_blas()) {
        fprintf(stderr, "error: this program is not built with BLAS.\n");
        return 1;
    }

    if (argc == 2) {
        if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
            usage(argv[0]);

25 ggml-cuda.cu
@@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
}

bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];

// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    // TODO: find the optimal values for these
    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
        src1->type == GGML_TYPE_F32 &&
        dst->type == GGML_TYPE_F32 &&
        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
        dst->type == GGML_TYPE_F32) {
        return true;
    }

@@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() {
    g_scratch_buffer = nullptr;
}

bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
    ggml_cuda_func_t func;
    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor);
    GGML_ASSERT(tensor->src0);
    return tensor->backend == GGML_BACKEND_GPU
        || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
        || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
}

bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
    ggml_cuda_func_t func;
    const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor);

    switch (tensor->op) {
        case GGML_OP_ADD:

@@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
            if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) {
            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
                return false;
            }
            func = ggml_cuda_mul_mat;

@@ -16,7 +16,7 @@ void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split);

void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_is_gpu_offloading(const struct ggml_tensor * src0);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

@@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    }
}

bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor);
    return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) ||
           (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU);
}

bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];

    // TODO: find the optimal values for these
// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
        src1->type == GGML_TYPE_F32 &&
        dst->type == GGML_TYPE_F32 /*&&
        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) {
        dst->type == GGML_TYPE_F32) {
        return true;
    }

@@ -9,7 +9,7 @@ extern "C" {
void ggml_cl_init(void);

void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor);
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {

    struct ggml_compute_state_shared *shared = state->shared;
    GGML_ASSERT(shared);
    GGML_ASSERT(shared->task_runner);
    //GGML_ASSERT(shared->task_runner);

    shared->n_ready++;

@@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
            : shared->task_runner;
    enum ggml_compute_error err = runner(&state->params, state->node);

    GGML_ASSERT(err == GGML_COMPUTE_OK);
    GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);

    ggml_spin_lock(&shared->spin);

@@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                              size_t wsize) {
    GGML_ASSERT(ctx);
    GGML_ASSERT(node);
    GGML_ASSERT(ctx->shared.task_runner);
    // GGML_ASSERT(ctx->shared.task_runner);

    ggml_task_runner *runner = ctx->shared.task_runner;
    if (node->task_profile.runner) {

@@ -448,7 +448,7 @@ START:
    memset(&params, 0, sizeof(struct ggml_compute_params));

    for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
        if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
        if (!node->task_profile.stages[type].valid) {
            continue;
        }

@@ -519,18 +519,17 @@ START:
    if (err == GGML_COMPUTE_FALLBACK) {
        PRINT_DEBUG("[main] fallback from profile, id=%d\n",
                    node->task_profile.id);
        GGML_ASSERT(node->task_profile.stages[1].backend >
                    GGML_TASK_BACKEND_CPU);
        GGML_ASSERT(node->task_profile.id > 1);

        struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
        int n = ggml_get_task_profiles(node, profiles);
        GGML_ASSERT(n > 0);
        GGML_ASSERT(profiles[0].stages[1].backend ==
                    GGML_TASK_BACKEND_CPU);
        GGML_ASSERT(profiles[0].id == 1);

        memcpy(&node->task_profile, &profiles[0],
               sizeof(struct ggml_task_profile));
               sizeof(struct ggml_task_profile));
        runner = ctx->shared.task_runner;
        GGML_ASSERT(runner);

        goto START;
    }

@@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
// thread: optional OS thread runner, default value:
//         `ggml_threading_graph_compute_thread`.
//
// features: optional for configure
// task_runner: default task runner, nullable when tensor.runner is not NULL.
//              Overridden by tensor.runner.
// features: configure threading behaviour, optional.
//           threading additional features. see `ggml_threading_feature`, default 0.
//
// stages_time: optional for collecting per-stage wall clock time.

@@ -51,12 +53,6 @@ enum ggml_compute_error
ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                              struct ggml_tensor *node, void *wdata,
                              size_t wsize);

// This is an experimental functionality for mulmat tune, as a thin wrapper.
enum ggml_compute_error
ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
                             struct ggml_tensor *tensor);

#ifdef __cplusplus
}
#endif
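The reworded comments above amount to a simple dispatch rule: a tensor's profile runner, when present, overrides the context-wide default task runner, which is why the hard asserts on `shared->task_runner` are relaxed in this commit. A minimal sketch of that selection, using hypothetical stand-in types rather than the real ggml-threading structs:

```c
// Hypothetical stand-ins for the real ggml-threading types.
typedef int (task_runner_fn)(void *params, void *node);

struct node_profile { task_runner_fn *runner; };          // ggml_task_profile.runner
struct node         { struct node_profile task_profile; };
struct thread_ctx   { task_runner_fn *task_runner; };     // default runner, now nullable

// The per-tensor runner wins; the context-wide default is only a fallback.
static task_runner_fn *select_runner(const struct thread_ctx *ctx,
                                     const struct node *n) {
    return n->task_profile.runner ? n->task_profile.runner : ctx->task_runner;
}
```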

165 ggml-tune.c
@@ -24,26 +24,7 @@ static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) {
    return hash;
}

static const char *
ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
    switch (backend) {
    case GGML_TASK_BACKEND_NONE:
        return "";
    case GGML_TASK_BACKEND_CPU:
        return "CPU";
    case GGML_TASK_BACKEND_CPU_BLAS:
        return "BLAS";
    case GGML_TASK_BACKEND_GPU:
        return "GPU";
    case GGML_TASK_BACKEND_GPU_CUDA:
        return "CUDA";
    case GGML_TASK_BACKEND_GPU_CL:
        return "CL";
    default:
        GGML_ASSERT(false);
    }
}

// Return profile id, -1 when failed (such as unable to match shape).
// NOTE: we can not use the profile from tune because the profiles do not
// contain fields such as runner, get_size.
int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,

@@ -101,20 +82,15 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
            e->K = K;

#ifndef GGML_TUNE_NDEBUG
            const char *names[3];
            for (int i = 0; i < 3; i++) {
                names[i] = ggml_mulmat_tune_task_backend_name(
                    prof->stages[i].backend);
            }
            printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, "
                   "backends: %s %s %s\n",
                   M, N, K, prof->id, names[0], names[1], names[2]);
                   M, N, K, prof->id, prof->name);
#endif
        }
    }
}

    return prof->id;
    return prof ? prof->id : -1;
}

void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,

@@ -283,25 +259,24 @@ static bool ggml_mulmat_tune_write_profiles(
    int rc;
    for (int i = 0; i < n_profiles; i++) {
        const struct ggml_task_profile *profile = &profiles[i];
        rc = fprintf(fp, "%d ", profile->id);
        if (rc <= 0) {
            return false;
        }

        for (int j = 0; j < 3; j++) {
            const struct ggml_task_stage *ts = &profile->stages[j];
            rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
                         ts->wait ? 1 : 0);
            rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0,
                         ts->parallel ? 1 : 0, ts->wait ? 1 : 0);
            if (rc <= 0) {
                return false;
            }
            if (j < 2) {
                rc = fprintf(fp, " ");
                rc = fprintf(fp, " ");
                if (rc <= 0) {
                    return false;
                }
            }
        }
        rc = fprintf(fp, " %d %s", profile->id, profile->name);
        if (rc <= 0) {
            return false;
        }
        rc = fprintf(fp, "\n");
        if (rc <= 0) {
            return false;

@@ -407,24 +382,24 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
    return ok;
}

bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
    GGML_ASSERT(tune);
    memset(tune, 0, sizeof(struct ggml_mulmat_tune));

    int rc = fscanf(fp, "%d", &tune->version);
    if (rc <= 0) {
        return false;
        return 1;
    }

    if (tune->version != GGML_MULMAT_TUNE_VERSION) {
        fprintf(stderr, "[tune] version mismatch, run bench again\n");
        return false;
        return 2;
    }

    rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype,
                &tune->n_shapes, &tune->n_threads);
    if (rc <= 0) {
        return false;
        return 3;
    }

    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {

@@ -434,7 +409,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
                    (int *)&shape->src0_type, (int *)&shape->src1_type,
                    &shape->n_profiles, &shape->m_num);
        if (rc <= 0) {
            return false;
            return 4;
        }

        {

@@ -451,24 +426,24 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
        for (int ip = 0; ip < shape->n_profiles; ip++) {
            struct ggml_task_profile *profile = &shape->profiles[ip];

            rc = fscanf(fp, "%d ", &profile->id);
            if (rc <= 0) {
                return false;
            }

            for (int j = 0; j < 3; j++) {
                struct ggml_task_stage *ts = &profile->stages[j];
                int backend;
                int valid;
                int parallel;
                int wait;
                rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait);
                rc = fscanf(fp, " %1d%1d%1d", &valid, &parallel, &wait);
                if (rc <= 0) {
                    return false;
                    return 5;
                }
                ts->backend = (enum ggml_task_backend)backend;
                ts->valid = valid ? true : false;
                ts->parallel = parallel ? true : false;
                ts->wait = wait ? true : false;
            }

            rc = fscanf(fp, "%d %s", &profile->id, profile->name);
            if (rc <= 0) {
                return 6;
            }
        }

        for (int i_m = 0; i_m < shape->m_num; i_m++) {

@@ -477,7 +452,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
                if (ip == 0) {
                    rc = fscanf(fp, "%d", &M);
                    if (rc <= 0) {
                        return false;
                        return 7;
                    }
                }
                struct ggml_mulmat_tune_m *item =

@@ -486,13 +461,13 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
                rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
                            &item->stages_time[1], &item->stages_time[2]);
                if (rc <= 0) {
                    return false;
                    return 8;
                }
            }
        }
    }

    return true;
    return 0;
}

bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,

@@ -535,7 +510,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,

            const struct ggml_task_profile *profile = &shape->profiles[ip];
            for (int k = 0; k < 3; k++) {
                if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
                if (profile->stages[k].valid) {
                    rc = fprintf(fp, "%9d", item->stages_time[k]);
                    if (rc <= 0) {
                        return false;

@@ -562,8 +537,6 @@ const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
                           const int K, enum ggml_type src0_type,
                           enum ggml_type src1_type) {
    GGML_ASSERT(N > 0 && K > 0);

    for (int i = 0; i < tune->n_shapes; i++) {
        const struct ggml_mulmat_tune_shape *s = &tune->shapes[i];
        if (s->src0_type != src0_type || s->src1_type != src1_type) {

@@ -574,13 +547,17 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
            if (s->N == N && s->K == K) {
                return s;
            }
        } else if (s->N > 0 && s->K == 0) {
            if (s->N == N) {
                return s;
            }
        } else if (s->N == 0 && s->K > 0) {
            if (s->K == K) {
                return s;
            }

            if (GGML_MULMAT_N_SHAPES == 6) {
                if (s->N > 0 && s->K == 0) {
                    if (s->N == N) {
                        return s;
                    }
                } else if (s->N == 0 && s->K > 0) {
                    if (s->K == K) {
                        return s;
                    }
                }
            }
        }

@@ -639,7 +616,7 @@ void ggml_mulmat_tune_estimate_time(

    for (int i_stage = 0; i_stage < 3; i_stage++) {
        const struct ggml_task_stage *stage = &profile->stages[i_stage];
        if (stage->backend == GGML_TASK_BACKEND_NONE) {
        if (!stage->valid) {
            continue;
        }

@@ -784,23 +761,6 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
    return sz;
}

int ggml_mulmat_tune_get_builtin_task_backends(
    enum ggml_task_backend *backends) {
    int i = 0;
    backends[i++] = GGML_TASK_BACKEND_CPU;

    if (ggml_cpu_has_cpublas()) {
        backends[i++] = GGML_TASK_BACKEND_CPU_BLAS;
    }

    if (ggml_cpu_has_cublas()) {
        backends[i++] = GGML_TASK_BACKEND_GPU_CUDA;
    } else if (ggml_cpu_has_clblast()) {
        backends[i++] = GGML_TASK_BACKEND_GPU_CL;
    }
    return i;
}

bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                            struct ggml_mulmat_tune_params *params) {
    GGML_ASSERT(tune);

@@ -809,23 +769,6 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,

    memset(tune, 0, sizeof(struct ggml_mulmat_tune));

    enum ggml_task_backend backends[16];
    int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
    if (n_backends < 2) {
        fprintf(stderr,
                "[tune] error: this program was not built with BLAS.\n");
        return false;
    }

    if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K &&
        params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) {
#if defined(GGML_USE_CLBLAST)
        printf("[tune] error: cl implementation does not support k_quants at "
               "the time of writing this code, skip.\n");
        return false;
#endif
    }

    bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
    if (!ok) {
        return false;

@@ -835,12 +778,13 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
        char buf[128] = {0};
        int offset = 0;

        for (int i = 0; i < n_backends; i++) {
        for (int i = 0; i < tune->shapes[0].n_profiles; i++) {
            if (i > 0) {
                buf[offset++] = ',';
                buf[offset++] = ' ';
            }
            const char *name = ggml_mulmat_tune_task_backend_name(backends[i]);
            const char *name = tune->shapes[0].profiles[i].name;
            GGML_ASSERT(name != NULL && strcmp(name, "") != 0);
            size_t len = strlen(name);
            memcpy(&buf[offset], name, len);
            offset += (int)len;

@@ -848,17 +792,17 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,

        fprintf(stdout,
                "[tune] model: %s, ggml ftype: %d, "
                "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n",
                "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n",
                params->model.name, params->model.ftype, params->n_pass,
                params->n_threads, tune->n_shapes, buf);
                tune->n_shapes, params->n_threads, buf);
    }

    int64_t stages_time[3];
    int64_t t0 = ggml_time_ms();

    struct ggml_threading_context *thrd_ctx = ggml_threading_start(
        tune->n_threads, NULL, ggml_compute_forward_wrapper,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);
    struct ggml_threading_context *thrd_ctx =
        ggml_threading_start(tune->n_threads, NULL, NULL,
                             GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);

    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
        const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];

@@ -896,6 +840,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,

        for (int ip = 0; ip < shape->n_profiles; ip++) {
            const struct ggml_task_profile *profile = &shape->profiles[ip];
            // GGML_ASSERT(profile->runner);

            memcpy(&node->task_profile, profile,
                   sizeof(struct ggml_task_profile));

@@ -911,9 +856,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                stages_time[j] = 0;
            }

            enum ggml_compute_error err = ggml_threading_compute_tensor(
                thrd_ctx, node, wdata, wsize);
            GGML_ASSERT(err == GGML_COMPUTE_OK);
            ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);

            if (memcmp(profile, &node->task_profile,
                       sizeof(struct ggml_task_profile)) != 0) {
                printf("[tune] error: task profile changed, tensor op: "
                       "%d, original id: %d, current id: %d\n",
                       node->op, profile->id, node->task_profile.id);
                exit(1);
            }

            for (int i = 0; i < 3; i++) {
                int v = (int)stages_time[i];

@@ -10,7 +10,7 @@
extern "C" {
#endif

#define GGML_MULMAT_TUNE_VERSION 9
#define GGML_MULMAT_TUNE_VERSION 10
#define GGML_MULMAT_N_SHAPES 4
#define GGML_MULMAT_CACHE_LEN 16

@@ -119,7 +119,7 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);

bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);

bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);
int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);

const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K,

@@ -129,11 +129,6 @@ void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape,
                                    int M,
                                    struct ggml_mulmat_tune_time *profile_time);

const char *ggml_task_backend_name(enum ggml_task_backend backend);

int ggml_mulmat_tune_get_builtin_task_backends(
    enum ggml_task_backend *backends);

bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                            struct ggml_mulmat_tune_params *params);

30 ggml.h
@@ -362,29 +362,10 @@ extern "C" {

    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);

    // As part of task config profile solution, `ggml_task_backend` defines
    // backends for each task stage. Similar to `ggml_tensor.backend`,
    // `ggml_tensor.task_profile` generalizes how to configure tensor computing
    // at per task-stage level.
    //
    // The following enum values are designed as combination of hardware and
    // optional software interface.
    enum ggml_task_backend {
        GGML_TASK_BACKEND_NONE = 0,

        // [0x10, 0x1F]: CPU
        GGML_TASK_BACKEND_CPU = 0x10,
        GGML_TASK_BACKEND_CPU_BLAS = 0x11,

        // [0x20 - 0x2F]: GPU
        GGML_TASK_BACKEND_GPU = 0x20,
        GGML_TASK_BACKEND_GPU_CUDA = 0x21,
        GGML_TASK_BACKEND_GPU_CL = 0x22,
    };

    // config for computing one of the 3 task stages of a tensor.
    struct ggml_task_stage {
        enum ggml_task_backend backend;
        bool valid;

        bool parallel;
        // hint idle workers go waiting, valid only when parallel is false.
        bool wait;

@@ -407,13 +388,16 @@ extern "C" {
    // Get wsize for node computing.
    // When return -1: should be explained as `fallback to CPU`, caller MUST
    // determine how much memory to reserve for this node.
    typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor);
    typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor);

    // config for computing a tensor.
    struct ggml_task_profile {
        // profile id, start from 1.
        int id;

        // Required, not empty, no whitespaces.
        char name[16];

        // index 0: INIT, 1: COMPUTE, 2: FINALIZE
        struct ggml_task_stage stages[3];

@@ -421,7 +405,7 @@ extern "C" {
        ggml_task_runner *runner;

        // Optional function to return required wsize for wdata.
        ggml_task_get_wsize *get_wsize;
        ggml_task_wsize_getter *wsize_getter;

        // Optional flag for development.
        // MUST be used only in testing codes.
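With the backend enum gone, a task-profiles provider only has to fill in id, name and the per-stage valid/parallel/wait bits (plus an optional runner). The sketch below mirrors the CPU/BLAS pair from the example tune output near the top ("100 110 000 1 CPU" and "110 101 000 2 BLAS"); the local struct mirrors and the provider name are illustrative, not the real ggml API.

```c
#include <stdbool.h>
#include <string.h>

struct ggml_tensor;   // opaque here

// Minimal local mirrors of the structs above, so the sketch stands alone.
struct task_stage   { bool valid, parallel, wait; };
struct task_profile { int id; char name[16]; struct task_stage stages[3]; };

// Hypothetical provider: fills two profiles and returns how many it produced.
// By convention, profile id 1 stays the plain CPU profile used for fallback.
static int example_task_profiles_provider(struct ggml_tensor *node,
                                          struct task_profile *profiles) {
    (void) node;  // a real provider branches on node->op and tensor types
    memset(profiles, 0, 2 * sizeof(struct task_profile));

    profiles[0].id = 1;
    strcpy(profiles[0].name, "CPU");
    profiles[0].stages[0].valid    = true;   // INIT:    100
    profiles[0].stages[1].valid    = true;   // COMPUTE: 110
    profiles[0].stages[1].parallel = true;

    profiles[1].id = 2;
    strcpy(profiles[1].name, "BLAS");
    profiles[1].stages[0].valid    = true;   // INIT:    110
    profiles[1].stages[0].parallel = true;
    profiles[1].stages[1].valid    = true;   // COMPUTE: 101
    profiles[1].stages[1].wait     = true;   // idle workers wait while BLAS runs

    return 2;
}
```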

73 llama.cpp
@@ -2744,8 +2744,9 @@ struct llama_context * llama_init_from_file(
}

#ifdef GGML_USE_TUNE
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
    GGML_ASSERT (ctx->model.n_gpu_layers == 0);
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune,
                       const char *fname) {
    GGML_ASSERT(ctx->model.n_gpu_layers == 0);

    printf("\n");

@@ -2755,7 +2756,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons

    enum ggml_ftype ggml_ftype;
    switch (hparams->ftype) {
    case LLAMA_FTYPE_ALL_F32:
    case LLAMA_FTYPE_ALL_F32:
        ggml_ftype = GGML_FTYPE_ALL_F32;
        break;
    case LLAMA_FTYPE_MOSTLY_F16:

@@ -2767,9 +2768,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
    case LLAMA_FTYPE_MOSTLY_Q4_1:
        ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
        break;
    case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
        ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
        break;
    case LLAMA_FTYPE_MOSTLY_Q5_0:
        ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
        break;

@@ -2799,8 +2797,8 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
        ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
        break;
    default:
        throw std::runtime_error(
            format("invalid output file type %d\n", hparams->ftype));
        fprintf(stderr, "[tune] unsupported file type %d\n", hparams->ftype);
        return false;
    }

    int n_vocab = hparams->n_vocab;

@@ -2808,30 +2806,36 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
    int n_rot = hparams->n_rot;

    int n_mult = hparams->n_mult;
    int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
    int n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;

    struct ggml_mulmat_tune_params params = {
        /*.model =*/ {
            /* .name =*/ model_name,
            /* .ftype =*/ ggml_ftype,
            /* .n_vocab =*/ n_vocab,
            /* .n_embd =*/ n_embd,
            /* .n_ff =*/ n_ff,
            /* .n_rot =*/ n_rot,
        /*.model =*/{
            /* .name =*/model_name,
            /* .ftype =*/ggml_ftype,
            /* .n_vocab =*/n_vocab,
            /* .n_embd =*/n_embd,
            /* .n_ff =*/n_ff,
            /* .n_rot =*/n_rot,
        },
        /* .m_num =*/ 8,
        /* .n_pass =*/ 1,
        /* .n_threads =*/ n_threads,
        /* .prrogress =*/ true,
        /* .output_console =*/ false,
        /* .fname =*/ fname,
        /* .m_num =*/8,
        /* .n_pass =*/1,
        /* .n_threads =*/n_threads,
        /* .prrogress =*/true,
        /* .output_console =*/false,
        /* .fname =*/fname,
    };

    bool empty_fname = !fname || strcmp(fname, "") == 0;

    ctx->tune = new(struct ggml_mulmat_tune);
    ctx->tune = new (struct ggml_mulmat_tune);
    if (!ctx->tune) {
        throw std::runtime_error(format("failed to allocate memory for tune\n"));
        fprintf(stderr, "[tune] failed to allocate memory for tune\n");
        return false;
    }

    if (!ggml_cpu_has_blas()) {
        fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n");
        return false;
    }

    if (tune) {

@@ -2844,31 +2848,30 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
            ggml_mulmat_tune_free(ctx->tune);
            return true;
        }
    } else {
        if (empty_fname) {
            return false;
        }
    } else if (empty_fname) {
        return false;
    }

    if (!empty_fname) {
        FILE *fp = fopen(fname, "r");
        if (!fp) {
            fprintf(stderr, "[tune] failed to open file %s.\n",
                    fname);
            fprintf(stderr, "[tune] failed to open file %s.\n", fname);
            return false;
        } else {
            bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
            int rc = ggml_mulmat_tune_read_data(ctx->tune, fp);
            fclose(fp);

            if (!ok) {
            if (rc != 0) {
                fprintf(stderr,
                        "[tune] failed to read data from %s\n",
                        fname);
                        "[tune] failed to read data from %s, error code: %d\n",
                        fname, rc);
                return false;
            }

            fprintf(stderr, "[tune] loaded data from %s\n", fname);

            ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads);
            bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype,
                                                params.n_threads);
            if (!ok) {
                return false;
            }

@@ -41,9 +41,8 @@ static const int n_repeat = 10;
// counter with array.
static int work_done_arr[MAX_N_THREADS];

static enum ggml_compute_error
mock_task_runner(const struct ggml_compute_params *params,
                 struct ggml_tensor *node) {
static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params,
                                                struct ggml_tensor *node) {
    int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
    if (node->task_profile.stages[params->type].parallel) {
        loops /= params->nth;

@@ -80,20 +79,15 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {

    int t0 = (int)ggml_time_us();

    struct ggml_threading_context *ctx = ggml_threading_start(
        n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL);
    node->task_profile.runner = mock_task_runner;

    struct ggml_threading_context *ctx =
        ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL);

    int t1 = (int)ggml_time_us();

    for (int i = 0; i < n_repeat; i++) {
        enum ggml_compute_error err = ggml_threading_compute_tensor(
            ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
        if (err != GGML_COMPUTE_OK) {
            ggml_threading_stop(ctx);
            printf("ggml_threading_compute_tensor failed with error: %d.\n",
                   err);
            return 1;
        }
        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
    }

    int t2 = (int)ggml_time_us();

@@ -107,7 +101,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
    int expect = 0;
    for (int i = 0; i < 3; i++) {
        const struct ggml_task_stage *ts = &stages[i];
        if (ts->backend != GGML_TASK_BACKEND_NONE) {
        if (ts->valid) {
            if (ts->parallel) {
                expect += n_threads;
            } else {

@@ -144,14 +138,12 @@ static enum ggml_compute_error
mock_task_runner_fallback(const struct ggml_compute_params *params,
                          struct ggml_tensor *node) {
    UNUSED(params);
    if (node->backend == GGML_BACKEND_GPU) {
        // ... finally failed to compute in GPU.

        node->backend = GGML_BACKEND_CPU;
    // failed to run ...
    if (node->task_profile.id == 2) {
        return GGML_COMPUTE_FALLBACK;
    } else {
        return GGML_COMPUTE_OK;
    }
    return GGML_COMPUTE_OK;
}

// By design, fallback should happen when attempt computing tensor in GPU,

@@ -164,6 +156,9 @@ int test_fallback(struct ggml_tensor *node) {
    enum ggml_compute_error err =
        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
    if (err == GGML_COMPUTE_FALLBACK) {
        // mock setup new profile ...
        node->task_profile.id = 1;

        err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
                                            /*wsize*/ 0);
    }

@@ -214,12 +209,12 @@ int main(void) {

    struct ggml_tensor node;
    memset(&node, 0, sizeof(struct ggml_tensor));
    node.task_profile.runner = mock_task_runner;

    struct ggml_task_stage *stages = node.task_profile.stages;

    stages[0].backend = GGML_TASK_BACKEND_CPU;
    stages[1].backend = GGML_TASK_BACKEND_CPU;
    stages[2].backend = GGML_TASK_BACKEND_NONE;
    stages[0].valid = true;
    stages[1].valid = true;

    int n_passed = 0;
    int n_tests = 0;

@@ -277,7 +272,7 @@ int main(void) {

    struct ggml_threading_context *ctx =
        ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
                             mock_task_runner, 0, /*stages_time*/ NULL);
                             NULL, 0, /*stages_time*/ NULL);

    int t1 = (int)ggml_time_us();

@@ -416,8 +411,8 @@ int main(void) {
    node.src0 = &src0;
    node.src1 = &src1;

    node.backend = GGML_BACKEND_GPU;
    stages[1].backend = GGML_TASK_BACKEND_GPU;
    node.task_profile.id = 2;
    stages[1].valid = true;
    if (test_fallback(&node) == 0) {
        ++n_passed;
        printf("[test-ggml-threading] test fallback: ok\n\n");

@@ -46,13 +46,9 @@ int main(void) {
}

static int bench(void) {
    {
        enum ggml_task_backend backends[16];
        int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
        if (n_backends < 2) {
            printf("[test-ggml-tune] skipped because no BLAS\n");
            return 0;
        }
    if (!ggml_cpu_has_blas()) {
        printf("[test-ggml-tune] skipped because no BLAS\n");
        return 0;
    }

    {

@@ -118,10 +114,13 @@ static int
ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
                                     struct ggml_task_profile *profiles) {
    UNUSED(node);
    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
    profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
    profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
    profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
    profiles[0].id = 1;
    profiles[0].stages[0].valid = true;
    profiles[0].stages[1].valid = true;

    profiles[1].id = 2;
    profiles[1].stages[0].valid = true;
    profiles[1].stages[1].valid = true;

    return 2;
}