bulk refactoring of task profile and related code to get CL GPU offloading running.

* removed ggml_task_backend in favour of ggml_task_profile.runner and the newly added id and name fields (sketched below).
* extracted the mul_mat BLAS code into ggml_compute_forward_mul_mat_blas,
  which aligns it a bit more with CUDA/CL and makes it easier to fix profiles and run tune.
* rewrote the task profiles and updated/added some CUDA/CL code, finally making CL GPU offloading work.
* misc minor fixes/updates to tune; the data format was changed.
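
For reference, a minimal sketch of the reworked structs, abridged from the ggml.h hunk later in this diff (dev flags and the exact comments elided):

```
// config for computing one of the 3 task stages of a tensor.
struct ggml_task_stage {
    bool valid;    // replaces the removed enum ggml_task_backend
    bool parallel;
    bool wait;     // hint idle workers to wait; meaningful only when parallel is false
};

// config for computing a tensor.
struct ggml_task_profile {
    int  id;                              // profile id, starts from 1
    char name[16];                        // required, not empty, no whitespace
    struct ggml_task_stage stages[3];     // index 0: INIT, 1: COMPUTE, 2: FINALIZE
    ggml_task_runner       *runner;       // per-profile task runner
    ggml_task_wsize_getter *wsize_getter; // optional, returns required wsize for wdata
    // ... dev flags omitted
};
```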
This commit is contained in:
mqy 2023-06-18 12:29:16 +08:00
parent 6b83a3e16f
commit 06b00827a0
15 changed files with 673 additions and 773 deletions

View File

@ -214,25 +214,18 @@ The following results are generated with Accelerate compiled.
**Example** **Example**
``` ```
5 3B 2 6 1 [tune] done, elapsed time: 0 seconds.
10 xB 12 4 2
3200 3200 2 0 3 10 1024 1024 12 0 2 4
16 0 0 0 16 1 0 1 0 0 0 0 100 110 000 1 CPU
16 1 0 2 17 0 1 0 0 0 0 0 110 101 000 2 BLAS
0 0 0 0 34 0 1 0 0 0 0 0 1 11 309 0 1234 90 0
1 1 793 0 9103 2102 0 0 6014 0 2 23 654 0 1359 215 0
2 2 1591 0 8034 2305 0 0 30982 0 4 44 1283 0 1362 421 0
4 4 2236 0 6476 2484 0 0 31388 0 8 85 2341 0 1357 347 0
8 7 4161 0 6623 2389 0 0 29204 0
16 15 8339 0 6434 2752 0 0 34303 0
32 32 16919 0 6915 3651 0 0 42511 0
64 200 34270 0 6574 4528 0 0 68212 0
128 188 69400 0 6325 6839 0 0 74437 0
256 303 134597 0 6168 11544 0 0 110180 0
512 687 279685 0 6337 29712 0 0 159728 0
3200 8640 2 0 2 10
1024 2048 12 0 2 4
... ...
``` ```
@ -249,17 +242,17 @@ shape+
# head # head
version: 1 version: 1
model: "3B" | "7B" | "13B" | "30B" | "65B" model: "3B" | "7B" | "13B" | "30B" | "65B"
ggml_ftype: 0 - 4, 7 - 14 ggml_ftype: 0 - 3, 7 - 14
n_shapes: number of shapes n_shapes: number of shapes
n_threads: number of threads n_threads: number of threads
shape := N K m_num n_profiles shape := N K src0_ggml_type src1_ggml_type n_profiles m_num
task_conf_profile+ task_profile+
bench_item+ bench_item+
task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name
stage_conf: backend parallel wait stage_conf(bitmap): valid parallel wait
backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) valid: 0 (false) | 1 (true)
parallel: 0 (false) | 1 (true) parallel: 0 (false) | 1 (true)
wait: 0 (false) | 1 (true) wait: 0 (false) | 1 (true)
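
For illustration, a minimal sketch of how one `stage_conf` bitmap could be written and parsed under this format. The `stage_conf` struct here is a stand-in for `ggml_task_stage`; the real I/O is in the `%1d%1d%1d` changes to ggml-tune.c further down in this diff.

```
#include <stdbool.h>
#include <stdio.h>

struct stage_conf { bool valid, parallel, wait; };

// e.g. {valid=1, parallel=1, wait=0} is written as "110"
static void stage_conf_write(FILE *fp, const struct stage_conf *s) {
    fprintf(fp, "%1d%1d%1d", s->valid ? 1 : 0, s->parallel ? 1 : 0, s->wait ? 1 : 0);
}

static bool stage_conf_read(FILE *fp, struct stage_conf *s) {
    int v, p, w;
    if (fscanf(fp, " %1d%1d%1d", &v, &p, &w) != 3) {
        return false;
    }
    s->valid    = v != 0;
    s->parallel = p != 0;
    s->wait     = w != 0;
    return true;
}
```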

View File

@ -111,6 +111,11 @@ static void usage(char *prog) {
} }
int main(int argc, char **argv) { int main(int argc, char **argv) {
if (!ggml_cpu_has_blas()) {
fprintf(stderr, "error: this program is not built with BLAS.\n");
return 1;
}
if (argc == 2) { if (argc == 2) {
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
usage(argv[0]); usage(argv[0]);

View File

@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
} }
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
const int64_t ne10 = src1->ne[0]; static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these // TODO: find the optimal values for these
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
return true; return true;
} }
@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() {
g_scratch_buffer = nullptr; g_scratch_buffer = nullptr;
} }
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) {
ggml_cuda_func_t func; GGML_ASSERT(tensor);
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU GGML_ASSERT(tensor->src0);
return tensor->backend == GGML_BACKEND_GPU
|| tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
|| (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
}
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
ggml_cuda_func_t func;
const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor);
switch (tensor->op) { switch (tensor->op) {
case GGML_OP_ADD: case GGML_OP_ADD:
@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
func = ggml_cuda_rms_norm; func = ggml_cuda_rms_norm;
break; break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
return false; return false;
} }
func = ggml_cuda_mul_mat; func = ggml_cuda_mul_mat;

View File

@ -16,7 +16,7 @@ void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split); void ggml_cuda_set_tensor_split(const float * tensor_split);
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

View File

@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
} }
} }
bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor);
return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) ||
(tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU);
}
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
const int64_t ne10 = src1->ne[0]; static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 /*&& dst->type == GGML_TYPE_F32) {
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) {
return true; return true;
} }

View File

@ -9,7 +9,7 @@ extern "C" {
void ggml_cl_init(void); void ggml_cl_init(void);
void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor);
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

View File

@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
struct ggml_compute_state_shared *shared = state->shared; struct ggml_compute_state_shared *shared = state->shared;
GGML_ASSERT(shared); GGML_ASSERT(shared);
GGML_ASSERT(shared->task_runner); //GGML_ASSERT(shared->task_runner);
shared->n_ready++; shared->n_ready++;
@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
: shared->task_runner; : shared->task_runner;
enum ggml_compute_error err = runner(&state->params, state->node); enum ggml_compute_error err = runner(&state->params, state->node);
GGML_ASSERT(err == GGML_COMPUTE_OK); GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
ggml_spin_lock(&shared->spin); ggml_spin_lock(&shared->spin);
@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
size_t wsize) { size_t wsize) {
GGML_ASSERT(ctx); GGML_ASSERT(ctx);
GGML_ASSERT(node); GGML_ASSERT(node);
GGML_ASSERT(ctx->shared.task_runner); // GGML_ASSERT(ctx->shared.task_runner);
ggml_task_runner *runner = ctx->shared.task_runner; ggml_task_runner *runner = ctx->shared.task_runner;
if (node->task_profile.runner) { if (node->task_profile.runner) {
@ -448,7 +448,7 @@ START:
memset(&params, 0, sizeof(struct ggml_compute_params)); memset(&params, 0, sizeof(struct ggml_compute_params));
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { if (!node->task_profile.stages[type].valid) {
continue; continue;
} }
@ -519,18 +519,17 @@ START:
if (err == GGML_COMPUTE_FALLBACK) { if (err == GGML_COMPUTE_FALLBACK) {
PRINT_DEBUG("[main] fallback from profile, id=%d\n", PRINT_DEBUG("[main] fallback from profile, id=%d\n",
node->task_profile.id); node->task_profile.id);
GGML_ASSERT(node->task_profile.stages[1].backend > GGML_ASSERT(node->task_profile.id > 1);
GGML_TASK_BACKEND_CPU);
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
int n = ggml_get_task_profiles(node, profiles); int n = ggml_get_task_profiles(node, profiles);
GGML_ASSERT(n > 0); GGML_ASSERT(n > 0);
GGML_ASSERT(profiles[0].stages[1].backend == GGML_ASSERT(profiles[0].id == 1);
GGML_TASK_BACKEND_CPU);
memcpy(&node->task_profile, &profiles[0], memcpy(&node->task_profile, &profiles[0],
sizeof(struct ggml_task_profile)); sizeof(struct ggml_task_profile));
runner = ctx->shared.task_runner; runner = ctx->shared.task_runner;
GGML_ASSERT(runner);
goto START; goto START;
} }

View File

@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
// thread: optional OS thread runner, default value: // thread: optional OS thread runner, default value:
// `ggml_threading_graph_compute_thread`. // `ggml_threading_graph_compute_thread`.
// //
// features: optional for configure // task_runner: default task runner, nullable when tensor.runner is not NULL.
// Overridden by tensor.runner.
// features: configure threading behaviour, optional.
// threading additional features. see `ggml_threading_feature`, default 0. // threading additional features. see `ggml_threading_feature`, default 0.
// //
// stages_time: optional for collecting per-stage wall clock time. // stages_time: optional for collecting per-stage wall clock time.
@ -51,12 +53,6 @@ enum ggml_compute_error
ggml_threading_compute_tensor(struct ggml_threading_context *ctx, ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
struct ggml_tensor *node, void *wdata, struct ggml_tensor *node, void *wdata,
size_t wsize); size_t wsize);
// This is an experimental functionality for mulmat tune, as a thin wrapper.
enum ggml_compute_error
ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
struct ggml_tensor *tensor);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
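
A usage sketch of the updated contract described above: the default task runner passed to `ggml_threading_start()` may be NULL as long as every tensor to be computed carries its own `task_profile.runner`. This mirrors the pattern in test-ggml-threading.c further down; `my_runner` and `compute_one` are illustrative names, not part of the API.

```
// placeholder for a real ggml_task_runner implementation
static enum ggml_compute_error my_runner(const struct ggml_compute_params *params,
                                         struct ggml_tensor *node) {
    (void) params; (void) node;
    return GGML_COMPUTE_OK; // a real runner does the per-stage work here
}

static void compute_one(struct ggml_tensor *node, int n_threads) {
    node->task_profile.runner = my_runner;     // per-tensor runner overrides the default
    node->task_profile.stages[1].valid = true; // enable the COMPUTE stage

    // default task_runner is NULL: allowed because node->task_profile.runner is set
    struct ggml_threading_context *ctx = ggml_threading_start(
        n_threads, /*thread*/ NULL, /*task_runner*/ NULL,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, /*stages_time*/ NULL);

    enum ggml_compute_error err =
        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
    GGML_ASSERT(err == GGML_COMPUTE_OK);

    ggml_threading_stop(ctx);
}
```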

View File

@ -24,26 +24,7 @@ static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) {
return hash; return hash;
} }
static const char * // Return profile id, -1 when failed (such as unable to match shape).
ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
switch (backend) {
case GGML_TASK_BACKEND_NONE:
return "";
case GGML_TASK_BACKEND_CPU:
return "CPU";
case GGML_TASK_BACKEND_CPU_BLAS:
return "BLAS";
case GGML_TASK_BACKEND_GPU:
return "GPU";
case GGML_TASK_BACKEND_GPU_CUDA:
return "CUDA";
case GGML_TASK_BACKEND_GPU_CL:
return "CL";
default:
GGML_ASSERT(false);
}
}
// NOTE: we can not use the profile from tune because the profiles do not // NOTE: we can not use the profile from tune because the profiles do not
// contain fields such as runner, get_size. // contain fields such as runner, get_size.
int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
@ -101,20 +82,15 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
e->K = K; e->K = K;
#ifndef GGML_TUNE_NDEBUG #ifndef GGML_TUNE_NDEBUG
const char *names[3];
for (int i = 0; i < 3; i++) {
names[i] = ggml_mulmat_tune_task_backend_name(
prof->stages[i].backend);
}
printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, "
"backends: %s %s %s\n", "backends: %s %s %s\n",
M, N, K, prof->id, names[0], names[1], names[2]); M, N, K, prof->id, prof->name);
#endif #endif
} }
} }
} }
return prof->id; return prof ? prof->id : -1;
} }
void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
@ -283,15 +259,10 @@ static bool ggml_mulmat_tune_write_profiles(
int rc; int rc;
for (int i = 0; i < n_profiles; i++) { for (int i = 0; i < n_profiles; i++) {
const struct ggml_task_profile *profile = &profiles[i]; const struct ggml_task_profile *profile = &profiles[i];
rc = fprintf(fp, "%d ", profile->id);
if (rc <= 0) {
return false;
}
for (int j = 0; j < 3; j++) { for (int j = 0; j < 3; j++) {
const struct ggml_task_stage *ts = &profile->stages[j]; const struct ggml_task_stage *ts = &profile->stages[j];
rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0,
ts->wait ? 1 : 0); ts->parallel ? 1 : 0, ts->wait ? 1 : 0);
if (rc <= 0) { if (rc <= 0) {
return false; return false;
} }
@ -302,6 +273,10 @@ static bool ggml_mulmat_tune_write_profiles(
} }
} }
} }
rc = fprintf(fp, " %d %s", profile->id, profile->name);
if (rc <= 0) {
return false;
}
rc = fprintf(fp, "\n"); rc = fprintf(fp, "\n");
if (rc <= 0) { if (rc <= 0) {
return false; return false;
@ -407,24 +382,24 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
return ok; return ok;
} }
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
GGML_ASSERT(tune); GGML_ASSERT(tune);
memset(tune, 0, sizeof(struct ggml_mulmat_tune)); memset(tune, 0, sizeof(struct ggml_mulmat_tune));
int rc = fscanf(fp, "%d", &tune->version); int rc = fscanf(fp, "%d", &tune->version);
if (rc <= 0) { if (rc <= 0) {
return false; return 1;
} }
if (tune->version != GGML_MULMAT_TUNE_VERSION) { if (tune->version != GGML_MULMAT_TUNE_VERSION) {
fprintf(stderr, "[tune] version mismatch, run bench again\n"); fprintf(stderr, "[tune] version mismatch, run bench again\n");
return false; return 2;
} }
rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype,
&tune->n_shapes, &tune->n_threads); &tune->n_shapes, &tune->n_threads);
if (rc <= 0) { if (rc <= 0) {
return false; return 3;
} }
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
@ -434,7 +409,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
(int *)&shape->src0_type, (int *)&shape->src1_type, (int *)&shape->src0_type, (int *)&shape->src1_type,
&shape->n_profiles, &shape->m_num); &shape->n_profiles, &shape->m_num);
if (rc <= 0) { if (rc <= 0) {
return false; return 4;
} }
{ {
@ -451,24 +426,24 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
for (int ip = 0; ip < shape->n_profiles; ip++) { for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip]; struct ggml_task_profile *profile = &shape->profiles[ip];
rc = fscanf(fp, "%d ", &profile->id);
if (rc <= 0) {
return false;
}
for (int j = 0; j < 3; j++) { for (int j = 0; j < 3; j++) {
struct ggml_task_stage *ts = &profile->stages[j]; struct ggml_task_stage *ts = &profile->stages[j];
int backend; int valid;
int parallel; int parallel;
int wait; int wait;
rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait); rc = fscanf(fp, " %1d%1d%1d", &valid, &parallel, &wait);
if (rc <= 0) { if (rc <= 0) {
return false; return 5;
} }
ts->backend = (enum ggml_task_backend)backend; ts->valid = valid ? true : false;
ts->parallel = parallel ? true : false; ts->parallel = parallel ? true : false;
ts->wait = wait ? true : false; ts->wait = wait ? true : false;
} }
rc = fscanf(fp, "%d %s", &profile->id, profile->name);
if (rc <= 0) {
return 6;
}
} }
for (int i_m = 0; i_m < shape->m_num; i_m++) { for (int i_m = 0; i_m < shape->m_num; i_m++) {
@ -477,7 +452,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
if (ip == 0) { if (ip == 0) {
rc = fscanf(fp, "%d", &M); rc = fscanf(fp, "%d", &M);
if (rc <= 0) { if (rc <= 0) {
return false; return 7;
} }
} }
struct ggml_mulmat_tune_m *item = struct ggml_mulmat_tune_m *item =
@ -486,13 +461,13 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
rc = fscanf(fp, "%d %d %d", &item->stages_time[0], rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
&item->stages_time[1], &item->stages_time[2]); &item->stages_time[1], &item->stages_time[2]);
if (rc <= 0) { if (rc <= 0) {
return false; return 8;
} }
} }
} }
} }
return true; return 0;
} }
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
@ -535,7 +510,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
const struct ggml_task_profile *profile = &shape->profiles[ip]; const struct ggml_task_profile *profile = &shape->profiles[ip];
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { if (profile->stages[k].valid) {
rc = fprintf(fp, "%9d", item->stages_time[k]); rc = fprintf(fp, "%9d", item->stages_time[k]);
if (rc <= 0) { if (rc <= 0) {
return false; return false;
@ -562,8 +537,6 @@ const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
const int K, enum ggml_type src0_type, const int K, enum ggml_type src0_type,
enum ggml_type src1_type) { enum ggml_type src1_type) {
GGML_ASSERT(N > 0 && K > 0);
for (int i = 0; i < tune->n_shapes; i++) { for (int i = 0; i < tune->n_shapes; i++) {
const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; const struct ggml_mulmat_tune_shape *s = &tune->shapes[i];
if (s->src0_type != src0_type || s->src1_type != src1_type) { if (s->src0_type != src0_type || s->src1_type != src1_type) {
@ -574,7 +547,10 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
if (s->N == N && s->K == K) { if (s->N == N && s->K == K) {
return s; return s;
} }
} else if (s->N > 0 && s->K == 0) { }
if (GGML_MULMAT_N_SHAPES == 6) {
if (s->N > 0 && s->K == 0) {
if (s->N == N) { if (s->N == N) {
return s; return s;
} }
@ -584,6 +560,7 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
} }
} }
} }
}
return NULL; return NULL;
} }
@ -639,7 +616,7 @@ void ggml_mulmat_tune_estimate_time(
for (int i_stage = 0; i_stage < 3; i_stage++) { for (int i_stage = 0; i_stage < 3; i_stage++) {
const struct ggml_task_stage *stage = &profile->stages[i_stage]; const struct ggml_task_stage *stage = &profile->stages[i_stage];
if (stage->backend == GGML_TASK_BACKEND_NONE) { if (!stage->valid) {
continue; continue;
} }
@ -784,23 +761,6 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
return sz; return sz;
} }
int ggml_mulmat_tune_get_builtin_task_backends(
enum ggml_task_backend *backends) {
int i = 0;
backends[i++] = GGML_TASK_BACKEND_CPU;
if (ggml_cpu_has_cpublas()) {
backends[i++] = GGML_TASK_BACKEND_CPU_BLAS;
}
if (ggml_cpu_has_cublas()) {
backends[i++] = GGML_TASK_BACKEND_GPU_CUDA;
} else if (ggml_cpu_has_clblast()) {
backends[i++] = GGML_TASK_BACKEND_GPU_CL;
}
return i;
}
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params) { struct ggml_mulmat_tune_params *params) {
GGML_ASSERT(tune); GGML_ASSERT(tune);
@ -809,23 +769,6 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
memset(tune, 0, sizeof(struct ggml_mulmat_tune)); memset(tune, 0, sizeof(struct ggml_mulmat_tune));
enum ggml_task_backend backends[16];
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
if (n_backends < 2) {
fprintf(stderr,
"[tune] error: this program was not built with BLAS.\n");
return false;
}
if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K &&
params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) {
#if defined(GGML_USE_CLBLAST)
printf("[tune] error: cl implementation does not support k_quants at "
"the time of writing this code, skip.\n");
return false;
#endif
}
bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
if (!ok) { if (!ok) {
return false; return false;
@ -835,12 +778,13 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
char buf[128] = {0}; char buf[128] = {0};
int offset = 0; int offset = 0;
for (int i = 0; i < n_backends; i++) { for (int i = 0; i < tune->shapes[0].n_profiles; i++) {
if (i > 0) { if (i > 0) {
buf[offset++] = ','; buf[offset++] = ',';
buf[offset++] = ' '; buf[offset++] = ' ';
} }
const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); const char *name = tune->shapes[0].profiles[i].name;
GGML_ASSERT(name != NULL && strcmp(name, "") != 0);
size_t len = strlen(name); size_t len = strlen(name);
memcpy(&buf[offset], name, len); memcpy(&buf[offset], name, len);
offset += (int)len; offset += (int)len;
@ -848,16 +792,16 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
fprintf(stdout, fprintf(stdout,
"[tune] model: %s, ggml ftype: %d, " "[tune] model: %s, ggml ftype: %d, "
"n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n",
params->model.name, params->model.ftype, params->n_pass, params->model.name, params->model.ftype, params->n_pass,
params->n_threads, tune->n_shapes, buf); tune->n_shapes, params->n_threads, buf);
} }
int64_t stages_time[3]; int64_t stages_time[3];
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
struct ggml_threading_context *thrd_ctx = ggml_threading_start( struct ggml_threading_context *thrd_ctx =
tune->n_threads, NULL, ggml_compute_forward_wrapper, ggml_threading_start(tune->n_threads, NULL, NULL,
GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
@ -896,6 +840,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
for (int ip = 0; ip < shape->n_profiles; ip++) { for (int ip = 0; ip < shape->n_profiles; ip++) {
const struct ggml_task_profile *profile = &shape->profiles[ip]; const struct ggml_task_profile *profile = &shape->profiles[ip];
// GGML_ASSERT(profile->runner);
memcpy(&node->task_profile, profile, memcpy(&node->task_profile, profile,
sizeof(struct ggml_task_profile)); sizeof(struct ggml_task_profile));
@ -911,9 +856,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
stages_time[j] = 0; stages_time[j] = 0;
} }
enum ggml_compute_error err = ggml_threading_compute_tensor( ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
thrd_ctx, node, wdata, wsize);
GGML_ASSERT(err == GGML_COMPUTE_OK); if (memcmp(profile, &node->task_profile,
sizeof(struct ggml_task_profile)) != 0) {
printf("[tune] error: task profile changed, tensor op: "
"%d, original id: %d, current id: %d\n",
node->op, profile->id, node->task_profile.id);
exit(1);
}
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
int v = (int)stages_time[i]; int v = (int)stages_time[i];

View File

@ -10,7 +10,7 @@
extern "C" { extern "C" {
#endif #endif
#define GGML_MULMAT_TUNE_VERSION 9 #define GGML_MULMAT_TUNE_VERSION 10
#define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_N_SHAPES 4
#define GGML_MULMAT_CACHE_LEN 16 #define GGML_MULMAT_CACHE_LEN 16
@ -119,7 +119,7 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);
const struct ggml_mulmat_tune_shape * const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K,
@ -129,11 +129,6 @@ void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape,
int M, int M,
struct ggml_mulmat_tune_time *profile_time); struct ggml_mulmat_tune_time *profile_time);
const char *ggml_task_backend_name(enum ggml_task_backend backend);
int ggml_mulmat_tune_get_builtin_task_backends(
enum ggml_task_backend *backends);
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params); struct ggml_mulmat_tune_params *params);

906
ggml.c

File diff suppressed because it is too large

30
ggml.h
View File

@ -362,29 +362,10 @@ extern "C" {
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// As part of task config profile solution, `ggml_task_backend` defines
// backends for each task stage. Similar to `ggml_tensor.backend`,
// `ggml_tensor.task_profile` generalizes how to configure tensor computing
// at per task-stage level.
//
// The following enum values are designed as combination of hardware and
// optional software interface.
enum ggml_task_backend {
GGML_TASK_BACKEND_NONE = 0,
// [0x10, 0x1F]: CPU
GGML_TASK_BACKEND_CPU = 0x10,
GGML_TASK_BACKEND_CPU_BLAS = 0x11,
// [0x20 - 0x2F]: GPU
GGML_TASK_BACKEND_GPU = 0x20,
GGML_TASK_BACKEND_GPU_CUDA = 0x21,
GGML_TASK_BACKEND_GPU_CL = 0x22,
};
// config for computing one of the 3 task stages of a tensor. // config for computing one of the 3 task stages of a tensor.
struct ggml_task_stage { struct ggml_task_stage {
enum ggml_task_backend backend; bool valid;
bool parallel; bool parallel;
// hint idle workers go waiting, valid only when parallel is false. // hint idle workers go waiting, valid only when parallel is false.
bool wait; bool wait;
@ -407,13 +388,16 @@ extern "C" {
// Get wsize for node computing. // Get wsize for node computing.
// When return -1: should be explained as `fallback to CPU`, caller MUST // When return -1: should be explained as `fallback to CPU`, caller MUST
// determine how much memory to reserve for this node. // determine how much memory to reserve for this node.
typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor); typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor);
// config for computing a tensor. // config for computing a tensor.
struct ggml_task_profile { struct ggml_task_profile {
// profile id, start from 1. // profile id, start from 1.
int id; int id;
// Required, not empty, no whitespace.
char name[16];
// index 0: INIT, 1: COMPUTE, 2: FINALIZE // index 0: INIT, 1: COMPUTE, 2: FINALIZE
struct ggml_task_stage stages[3]; struct ggml_task_stage stages[3];
@ -421,7 +405,7 @@ extern "C" {
ggml_task_runner *runner; ggml_task_runner *runner;
// Optional function to return required wsize for wdata. // Optional function to return required wsize for wdata.
ggml_task_get_wsize *get_wsize; ggml_task_wsize_getter *wsize_getter;
// Optional flag for development. // Optional flag for development.
// MUST be used only in testing codes. // MUST be used only in testing codes.
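
For illustration, a minimal profile-provider sketch showing how the new `id`, `name`, and `stages[].valid` fields might be filled. It is modelled on the mock provider in test-ggml-tune.c below; the function name is hypothetical, and `<stdio.h>` plus the declarations above are assumed to be in scope.

```
static int my_task_profiles_provider(struct ggml_tensor *node,
                                     struct ggml_task_profile *profiles) {
    (void) node; // shape-independent in this sketch

    profiles[0].id = 1;
    snprintf(profiles[0].name, sizeof(profiles[0].name), "CPU");
    profiles[0].stages[0].valid = true; // INIT
    profiles[0].stages[1].valid = true; // COMPUTE

    profiles[1].id = 2;
    snprintf(profiles[1].name, sizeof(profiles[1].name), "BLAS");
    profiles[1].stages[0].valid = true; // INIT
    profiles[1].stages[1].valid = true; // COMPUTE

    return 2; // number of profiles
}
```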

View File

@ -2744,7 +2744,8 @@ struct llama_context * llama_init_from_file(
} }
#ifdef GGML_USE_TUNE #ifdef GGML_USE_TUNE
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune,
const char *fname) {
GGML_ASSERT(ctx->model.n_gpu_layers == 0); GGML_ASSERT(ctx->model.n_gpu_layers == 0);
printf("\n"); printf("\n");
@ -2767,9 +2768,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
case LLAMA_FTYPE_MOSTLY_Q4_1: case LLAMA_FTYPE_MOSTLY_Q4_1:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1; ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
break; break;
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
break;
case LLAMA_FTYPE_MOSTLY_Q5_0: case LLAMA_FTYPE_MOSTLY_Q5_0:
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0; ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
break; break;
@ -2799,8 +2797,8 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K; ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
break; break;
default: default:
throw std::runtime_error( fprintf(stderr, "[tune] unsupported file type %d\n", hparams->ftype);
format("invalid output file type %d\n", hparams->ftype)); return false;
} }
int n_vocab = hparams->n_vocab; int n_vocab = hparams->n_vocab;
@ -2831,7 +2829,13 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
ctx->tune = new (struct ggml_mulmat_tune); ctx->tune = new (struct ggml_mulmat_tune);
if (!ctx->tune) { if (!ctx->tune) {
throw std::runtime_error(format("failed to allocate memory for tune\n")); fprintf(stderr, "[tune] failed to allocate memory for tune\n");
return false;
}
if (!ggml_cpu_has_blas()) {
fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n");
return false;
} }
if (tune) { if (tune) {
@ -2844,31 +2848,30 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
ggml_mulmat_tune_free(ctx->tune); ggml_mulmat_tune_free(ctx->tune);
return true; return true;
} }
} else { } else if (empty_fname) {
if (empty_fname) {
return false; return false;
} }
}
if (!empty_fname) { if (!empty_fname) {
FILE *fp = fopen(fname, "r"); FILE *fp = fopen(fname, "r");
if (!fp) { if (!fp) {
fprintf(stderr, "[tune] failed to open file %s.\n", fprintf(stderr, "[tune] failed to open file %s.\n", fname);
fname); return false;
} else { } else {
bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); int rc = ggml_mulmat_tune_read_data(ctx->tune, fp);
fclose(fp); fclose(fp);
if (!ok) { if (rc != 0) {
fprintf(stderr, fprintf(stderr,
"[tune] failed to read data from %s\n", "[tune] failed to read data from %s, error code: %d\n",
fname); fname, rc);
return false; return false;
} }
fprintf(stderr, "[tune] loaded data from %s\n", fname); fprintf(stderr, "[tune] loaded data from %s\n", fname);
ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads); bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype,
params.n_threads);
if (!ok) { if (!ok) {
return false; return false;
} }

View File

@ -41,8 +41,7 @@ static const int n_repeat = 10;
// counter with array. // counter with array.
static int work_done_arr[MAX_N_THREADS]; static int work_done_arr[MAX_N_THREADS];
static enum ggml_compute_error static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params,
mock_task_runner(const struct ggml_compute_params *params,
struct ggml_tensor *node) { struct ggml_tensor *node) {
int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
if (node->task_profile.stages[params->type].parallel) { if (node->task_profile.stages[params->type].parallel) {
@ -80,20 +79,15 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
int t0 = (int)ggml_time_us(); int t0 = (int)ggml_time_us();
struct ggml_threading_context *ctx = ggml_threading_start( node->task_profile.runner = mock_task_runner;
n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL);
struct ggml_threading_context *ctx =
ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL);
int t1 = (int)ggml_time_us(); int t1 = (int)ggml_time_us();
for (int i = 0; i < n_repeat; i++) { for (int i = 0; i < n_repeat; i++) {
enum ggml_compute_error err = ggml_threading_compute_tensor( ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
if (err != GGML_COMPUTE_OK) {
ggml_threading_stop(ctx);
printf("ggml_threading_compute_tensor failed with error: %d.\n",
err);
return 1;
}
} }
int t2 = (int)ggml_time_us(); int t2 = (int)ggml_time_us();
@ -107,7 +101,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
int expect = 0; int expect = 0;
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
const struct ggml_task_stage *ts = &stages[i]; const struct ggml_task_stage *ts = &stages[i];
if (ts->backend != GGML_TASK_BACKEND_NONE) { if (ts->valid) {
if (ts->parallel) { if (ts->parallel) {
expect += n_threads; expect += n_threads;
} else { } else {
@ -144,14 +138,12 @@ static enum ggml_compute_error
mock_task_runner_fallback(const struct ggml_compute_params *params, mock_task_runner_fallback(const struct ggml_compute_params *params,
struct ggml_tensor *node) { struct ggml_tensor *node) {
UNUSED(params); UNUSED(params);
if (node->backend == GGML_BACKEND_GPU) {
// ... finally failed to compute in GPU.
node->backend = GGML_BACKEND_CPU; // failed to run ...
if (node->task_profile.id == 2) {
return GGML_COMPUTE_FALLBACK; return GGML_COMPUTE_FALLBACK;
} else {
return GGML_COMPUTE_OK;
} }
return GGML_COMPUTE_OK;
} }
// By design, fallback should happen when attempt computing tensor in GPU, // By design, fallback should happen when attempt computing tensor in GPU,
@ -164,6 +156,9 @@ int test_fallback(struct ggml_tensor *node) {
enum ggml_compute_error err = enum ggml_compute_error err =
ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
if (err == GGML_COMPUTE_FALLBACK) { if (err == GGML_COMPUTE_FALLBACK) {
// mock setup new profile ...
node->task_profile.id = 1;
err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
/*wsize*/ 0); /*wsize*/ 0);
} }
@ -214,12 +209,12 @@ int main(void) {
struct ggml_tensor node; struct ggml_tensor node;
memset(&node, 0, sizeof(struct ggml_tensor)); memset(&node, 0, sizeof(struct ggml_tensor));
node.task_profile.runner = mock_task_runner;
struct ggml_task_stage *stages = node.task_profile.stages; struct ggml_task_stage *stages = node.task_profile.stages;
stages[0].backend = GGML_TASK_BACKEND_CPU; stages[0].valid = true;
stages[1].backend = GGML_TASK_BACKEND_CPU; stages[1].valid = true;
stages[2].backend = GGML_TASK_BACKEND_NONE;
int n_passed = 0; int n_passed = 0;
int n_tests = 0; int n_tests = 0;
@ -277,7 +272,7 @@ int main(void) {
struct ggml_threading_context *ctx = struct ggml_threading_context *ctx =
ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
mock_task_runner, 0, /*stages_time*/ NULL); NULL, 0, /*stages_time*/ NULL);
int t1 = (int)ggml_time_us(); int t1 = (int)ggml_time_us();
@ -416,8 +411,8 @@ int main(void) {
node.src0 = &src0; node.src0 = &src0;
node.src1 = &src1; node.src1 = &src1;
node.backend = GGML_BACKEND_GPU; node.task_profile.id = 2;
stages[1].backend = GGML_TASK_BACKEND_GPU; stages[1].valid = true;
if (test_fallback(&node) == 0) { if (test_fallback(&node) == 0) {
++n_passed; ++n_passed;
printf("[test-ggml-threading] test fallback: ok\n\n"); printf("[test-ggml-threading] test fallback: ok\n\n");

View File

@ -46,14 +46,10 @@ int main(void) {
} }
static int bench(void) { static int bench(void) {
{ if (!ggml_cpu_has_blas()) {
enum ggml_task_backend backends[16];
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
if (n_backends < 2) {
printf("[test-ggml-tune] skipped because no BLAS\n"); printf("[test-ggml-tune] skipped because no BLAS\n");
return 0; return 0;
} }
}
{ {
struct ggml_init_params init_params = { struct ggml_init_params init_params = {
@ -118,10 +114,13 @@ static int
ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
struct ggml_task_profile *profiles) { struct ggml_task_profile *profiles) {
UNUSED(node); UNUSED(node);
profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; profiles[0].id = 1;
profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; profiles[0].stages[0].valid = true;
profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; profiles[0].stages[1].valid = true;
profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
profiles[1].id = 2;
profiles[1].stages[0].valid = true;
profiles[1].stages[1].valid = true;
return 2; return 2;
} }