diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md index df023757a..4e521211d 100644 --- a/examples/mulmat-tune/README.md +++ b/examples/mulmat-tune/README.md @@ -214,26 +214,19 @@ The following results are generated with Accelerate compiled. **Example** ``` -5 3B 2 6 1 +[tune] done, elapsed time: 0 seconds. +10 xB 12 4 2 -3200 3200 2 0 3 10 -16 0 0 0 16 1 0 1 0 0 0 0 -16 1 0 2 17 0 1 0 0 0 0 0 - 0 0 0 0 34 0 1 0 0 0 0 0 - 1 1 793 0 9103 2102 0 0 6014 0 - 2 2 1591 0 8034 2305 0 0 30982 0 - 4 4 2236 0 6476 2484 0 0 31388 0 - 8 7 4161 0 6623 2389 0 0 29204 0 - 16 15 8339 0 6434 2752 0 0 34303 0 - 32 32 16919 0 6915 3651 0 0 42511 0 - 64 200 34270 0 6574 4528 0 0 68212 0 - 128 188 69400 0 6325 6839 0 0 74437 0 - 256 303 134597 0 6168 11544 0 0 110180 0 - 512 687 279685 0 6337 29712 0 0 159728 0 +1024 1024 12 0 2 4 +100 110 000 1 CPU +110 101 000 2 BLAS + 1 11 309 0 1234 90 0 + 2 23 654 0 1359 215 0 + 4 44 1283 0 1362 421 0 + 8 85 2341 0 1357 347 0 -3200 8640 2 0 2 10 - - ... +1024 2048 12 0 2 4 +... ``` @@ -249,17 +242,17 @@ shape+ # head version: 1 model: "3B" | "7B" | "13B" | "30B" | "65B" -ggml_ftype: 0 - 4, 7 - 14 +ggml_ftype: 0 - 3, 7 - 14 n_shapes: number of shapes n_threads: number of threads -shape := N K m_num n_profiles -task_conf_profile+ +shape := N K src0_ggml_type src1_ggml_type n_profiles m_num +task_profile+ bench_item+ -task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) -stage_conf: backend parallel wait -backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) +task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name +stage_conf(bitmap): valid parallel wait +valid: 0 (false) | 1 (true) parallel: 0 (false) | 1 (true) wait: 0 (false) | 1 (true) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index da1d0a1c1..ba1cc0f8a 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -111,6 +111,11 @@ static void usage(char *prog) { } int main(int argc, char **argv) { + if (!ggml_cpu_has_blas()) { + fprintf(stderr, "error: this program is not built with BLAS.\n"); + return 1; + } + if (argc == 2) { if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { usage(argv[0]); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index cf52109bc..5a4c7725a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); } -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - +// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run. 
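For readers decoding the new tune-file layout documented in the README hunk above: each `task_profile` line is three `stage_conf` bitmaps (`valid parallel wait` for INIT, COMPUTE, FINALIZE) followed by the profile id and name. A minimal standalone sketch of that decoding, using the same `%1d%1d%1d` scanning that the reader in `ggml-tune.c` uses (the struct and variable names here are illustrative, not from the patch):

```c
#include <stdio.h>

struct stage_conf { int valid, parallel, wait; };

int main(void) {
    // One task_profile line from the README example above: "110 101 000 2 BLAS"
    const char *line = "110 101 000 2 BLAS";

    struct stage_conf st[3]; // INIT, COMPUTE, FINALIZE
    int id = 0;
    char name[16] = {0};

    int n = sscanf(line, "%1d%1d%1d %1d%1d%1d %1d%1d%1d %d %15s",
                   &st[0].valid, &st[0].parallel, &st[0].wait,
                   &st[1].valid, &st[1].parallel, &st[1].wait,
                   &st[2].valid, &st[2].parallel, &st[2].wait,
                   &id, name);
    if (n != 11) {
        return 1;
    }

    printf("profile %d (%s)\n", id, name);
    for (int i = 0; i < 3; i++) {
        printf("  stage %d: valid=%d parallel=%d wait=%d\n",
               i, st[i].valid, st[i].parallel, st[i].wait);
    }
    // For "110 101 000 2 BLAS": INIT is valid and parallel (dequantize),
    // COMPUTE is valid and waits (single-threaded BLAS), FINALIZE is unused.
    return 0;
}
```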
+static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // TODO: find the optimal values for these if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + dst->type == GGML_TYPE_F32) { return true; } @@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() { g_scratch_buffer = nullptr; } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ - ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU +bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->src0); + return tensor->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); +} + +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ + ggml_cuda_func_t func; + const bool any_on_device = is_gpu_offloading(tensor); switch (tensor->op) { case GGML_OP_ADD: @@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { return false; } func = ggml_cuda_mul_mat; diff --git a/ggml-cuda.h b/ggml-cuda.h index d32b44842..75ea94392 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -16,7 +16,7 @@ void ggml_init_cublas(void); void ggml_cuda_set_tensor_split(const float * tensor_split); void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cuda_is_gpu_offloading(const struct ggml_tensor * src0); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 2a1a04fca..28098793d 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } } +bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor); + return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) || + (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU); +} -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these +// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run. 
+static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 /*&& - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) { + dst->type == GGML_TYPE_F32) { return true; } diff --git a/ggml-opencl.h b/ggml-opencl.h index a92b445c9..1de12f55a 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -9,7 +9,7 @@ extern "C" { void ggml_cl_init(void); void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor); size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-threading.c b/ggml-threading.c index 7ef763c0f..dada9f3fe 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { struct ggml_compute_state_shared *shared = state->shared; GGML_ASSERT(shared); - GGML_ASSERT(shared->task_runner); + //GGML_ASSERT(shared->task_runner); shared->n_ready++; @@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { : shared->task_runner; enum ggml_compute_error err = runner(&state->params, state->node); - GGML_ASSERT(err == GGML_COMPUTE_OK); + GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); ggml_spin_lock(&shared->spin); @@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, size_t wsize) { GGML_ASSERT(ctx); GGML_ASSERT(node); - GGML_ASSERT(ctx->shared.task_runner); + // GGML_ASSERT(ctx->shared.task_runner); ggml_task_runner *runner = ctx->shared.task_runner; if (node->task_profile.runner) { @@ -448,7 +448,7 @@ START: memset(¶ms, 0, sizeof(struct ggml_compute_params)); for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { - if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { + if (!node->task_profile.stages[type].valid) { continue; } @@ -519,18 +519,17 @@ START: if (err == GGML_COMPUTE_FALLBACK) { PRINT_DEBUG("[main] fallback from profile, id=%d\n", node->task_profile.id); - GGML_ASSERT(node->task_profile.stages[1].backend > - GGML_TASK_BACKEND_CPU); + GGML_ASSERT(node->task_profile.id > 1); struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int n = ggml_get_task_profiles(node, profiles); GGML_ASSERT(n > 0); - GGML_ASSERT(profiles[0].stages[1].backend == - GGML_TASK_BACKEND_CPU); + GGML_ASSERT(profiles[0].id == 1); memcpy(&node->task_profile, &profiles[0], - sizeof(struct ggml_task_profile)); + sizeof(struct ggml_task_profile)); runner = ctx->shared.task_runner; + GGML_ASSERT(runner); goto START; } diff --git a/ggml-threading.h b/ggml-threading.h index 189fc2ed5..81192450c 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // thread: optional OS thread runner, default value: // `ggml_threading_graph_compute_thread`. 
// -// features: optional for configure +// task_runner: default task runner, nullable wheen tensor.runner is not NULL. +// Overridden by tensor.runner. +// features: configure threading behaviour, optional. // threading additional features. see `ggml_threading_feature`, default 0. // // stages_time: optional for collecting per-stage wall clock time. @@ -51,12 +53,6 @@ enum ggml_compute_error ggml_threading_compute_tensor(struct ggml_threading_context *ctx, struct ggml_tensor *node, void *wdata, size_t wsize); - -// This is an experimental functionality for mulmat tune, as a thin wrapper. -enum ggml_compute_error -ggml_compute_forward_wrapper(const struct ggml_compute_params *params, - struct ggml_tensor *tensor); - #ifdef __cplusplus } #endif diff --git a/ggml-tune.c b/ggml-tune.c index aeb63e957..444269ae4 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -24,26 +24,7 @@ static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) { return hash; } -static const char * -ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { - switch (backend) { - case GGML_TASK_BACKEND_NONE: - return ""; - case GGML_TASK_BACKEND_CPU: - return "CPU"; - case GGML_TASK_BACKEND_CPU_BLAS: - return "BLAS"; - case GGML_TASK_BACKEND_GPU: - return "GPU"; - case GGML_TASK_BACKEND_GPU_CUDA: - return "CUDA"; - case GGML_TASK_BACKEND_GPU_CL: - return "CL"; - default: - GGML_ASSERT(false); - } -} - +// Return profile id, -1 when failed (such as unable to match shape). // NOTE: we can not use the profile from tune because the profiles do not // contain fields such as runner, get_size. int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, @@ -101,20 +82,15 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, e->K = K; #ifndef GGML_TUNE_NDEBUG - const char *names[3]; - for (int i = 0; i < 3; i++) { - names[i] = ggml_mulmat_tune_task_backend_name( - prof->stages[i].backend); - } printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " "backends: %s %s %s\n", - M, N, K, prof->id, names[0], names[1], names[2]); + M, N, K, prof->id, prof->name); #endif } } } - return prof->id; + return prof ? prof->id : -1; } void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, @@ -283,25 +259,24 @@ static bool ggml_mulmat_tune_write_profiles( int rc; for (int i = 0; i < n_profiles; i++) { const struct ggml_task_profile *profile = &profiles[i]; - rc = fprintf(fp, "%d ", profile->id); - if (rc <= 0) { - return false; - } - for (int j = 0; j < 3; j++) { const struct ggml_task_stage *ts = &profile->stages[j]; - rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, - ts->wait ? 1 : 0); + rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0, + ts->parallel ? 1 : 0, ts->wait ? 
1 : 0); if (rc <= 0) { return false; } if (j < 2) { - rc = fprintf(fp, " "); + rc = fprintf(fp, " "); if (rc <= 0) { return false; } } } + rc = fprintf(fp, " %d %s", profile->id, profile->name); + if (rc <= 0) { + return false; + } rc = fprintf(fp, "\n"); if (rc <= 0) { return false; @@ -407,24 +382,24 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, return ok; } -bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { +int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { GGML_ASSERT(tune); memset(tune, 0, sizeof(struct ggml_mulmat_tune)); int rc = fscanf(fp, "%d", &tune->version); if (rc <= 0) { - return false; + return 1; } if (tune->version != GGML_MULMAT_TUNE_VERSION) { fprintf(stderr, "[tune] version mismatch, run bench again\n"); - return false; + return 2; } rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, &tune->n_shapes, &tune->n_threads); if (rc <= 0) { - return false; + return 3; } for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { @@ -434,7 +409,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { (int *)&shape->src0_type, (int *)&shape->src1_type, &shape->n_profiles, &shape->m_num); if (rc <= 0) { - return false; + return 4; } { @@ -451,24 +426,24 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; - rc = fscanf(fp, "%d ", &profile->id); - if (rc <= 0) { - return false; - } - for (int j = 0; j < 3; j++) { struct ggml_task_stage *ts = &profile->stages[j]; - int backend; + int valid; int parallel; int wait; - rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait); + rc = fscanf(fp, " %1d%1d%1d", &valid, ¶llel, &wait); if (rc <= 0) { - return false; + return 5; } - ts->backend = (enum ggml_task_backend)backend; + ts->valid = valid ? true : false; ts->parallel = parallel ? true : false; ts->wait = wait ? 
true : false; } + + rc = fscanf(fp, "%d %s", &profile->id, profile->name); + if (rc <= 0) { + return 6; + } } for (int i_m = 0; i_m < shape->m_num; i_m++) { @@ -477,7 +452,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { if (ip == 0) { rc = fscanf(fp, "%d", &M); if (rc <= 0) { - return false; + return 7; } } struct ggml_mulmat_tune_m *item = @@ -486,13 +461,13 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { rc = fscanf(fp, "%d %d %d", &item->stages_time[0], &item->stages_time[1], &item->stages_time[2]); if (rc <= 0) { - return false; + return 8; } } } } - return true; + return 0; } bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, @@ -535,7 +510,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, const struct ggml_task_profile *profile = &shape->profiles[ip]; for (int k = 0; k < 3; k++) { - if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { + if (profile->stages[k].valid) { rc = fprintf(fp, "%9d", item->stages_time[k]); if (rc <= 0) { return false; @@ -562,8 +537,6 @@ const struct ggml_mulmat_tune_shape * ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, const int K, enum ggml_type src0_type, enum ggml_type src1_type) { - GGML_ASSERT(N > 0 && K > 0); - for (int i = 0; i < tune->n_shapes; i++) { const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; if (s->src0_type != src0_type || s->src1_type != src1_type) { @@ -574,13 +547,17 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, if (s->N == N && s->K == K) { return s; } - } else if (s->N > 0 && s->K == 0) { - if (s->N == N) { - return s; - } - } else if (s->N == 0 && s->K > 0) { - if (s->K == K) { - return s; + } + + if (GGML_MULMAT_N_SHAPES == 6) { + if (s->N > 0 && s->K == 0) { + if (s->N == N) { + return s; + } + } else if (s->N == 0 && s->K > 0) { + if (s->K == K) { + return s; + } } } } @@ -639,7 +616,7 @@ void ggml_mulmat_tune_estimate_time( for (int i_stage = 0; i_stage < 3; i_stage++) { const struct ggml_task_stage *stage = &profile->stages[i_stage]; - if (stage->backend == GGML_TASK_BACKEND_NONE) { + if (!stage->valid) { continue; } @@ -784,23 +761,6 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { return sz; } -int ggml_mulmat_tune_get_builtin_task_backends( - enum ggml_task_backend *backends) { - int i = 0; - backends[i++] = GGML_TASK_BACKEND_CPU; - - if (ggml_cpu_has_cpublas()) { - backends[i++] = GGML_TASK_BACKEND_CPU_BLAS; - } - - if (ggml_cpu_has_cublas()) { - backends[i++] = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - backends[i++] = GGML_TASK_BACKEND_GPU_CL; - } - return i; -} - bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params) { GGML_ASSERT(tune); @@ -809,23 +769,6 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, memset(tune, 0, sizeof(struct ggml_mulmat_tune)); - enum ggml_task_backend backends[16]; - int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); - if (n_backends < 2) { - fprintf(stderr, - "[tune] error: this program was not built with BLAS.\n"); - return false; - } - - if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K && - params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) { -#if defined(GGML_USE_CLBLAST) - printf("[tune] error: cl implementation does not support k_quants at " - "the time of writing this code, skip.\n"); - return false; -#endif - } - bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); 
if (!ok) { return false; @@ -835,12 +778,13 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, char buf[128] = {0}; int offset = 0; - for (int i = 0; i < n_backends; i++) { + for (int i = 0; i < tune->shapes[0].n_profiles; i++) { if (i > 0) { buf[offset++] = ','; buf[offset++] = ' '; } - const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); + const char *name = tune->shapes[0].profiles[i].name; + GGML_ASSERT(name != NULL && strcmp(name, "") != 0); size_t len = strlen(name); memcpy(&buf[offset], name, len); offset += (int)len; @@ -848,17 +792,17 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, fprintf(stdout, "[tune] model: %s, ggml ftype: %d, " - "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", + "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n", params->model.name, params->model.ftype, params->n_pass, - params->n_threads, tune->n_shapes, buf); + tune->n_shapes, params->n_threads, buf); } int64_t stages_time[3]; int64_t t0 = ggml_time_ms(); - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - tune->n_threads, NULL, ggml_compute_forward_wrapper, - GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); + struct ggml_threading_context *thrd_ctx = + ggml_threading_start(tune->n_threads, NULL, NULL, + GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; @@ -896,6 +840,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, for (int ip = 0; ip < shape->n_profiles; ip++) { const struct ggml_task_profile *profile = &shape->profiles[ip]; + // GGML_ASSERT(profile->runner); memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); @@ -911,9 +856,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, stages_time[j] = 0; } - enum ggml_compute_error err = ggml_threading_compute_tensor( - thrd_ctx, node, wdata, wsize); - GGML_ASSERT(err == GGML_COMPUTE_OK); + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + + if (memcmp(profile, &node->task_profile, + sizeof(struct ggml_task_profile)) != 0) { + printf("[tune] error: task profile changed, tensor op: " + "%d, original id: %d, current id: %d\n", + node->op, profile->id, node->task_profile.id); + exit(1); + } for (int i = 0; i < 3; i++) { int v = (int)stages_time[i]; diff --git a/ggml-tune.h b/ggml-tune.h index 7955a50a9..addcd34db 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,7 +10,7 @@ extern "C" { #endif -#define GGML_MULMAT_TUNE_VERSION 9 +#define GGML_MULMAT_TUNE_VERSION 10 #define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_CACHE_LEN 16 @@ -119,7 +119,7 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); -bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); +int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); const struct ggml_mulmat_tune_shape * ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, @@ -129,11 +129,6 @@ void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape, int M, struct ggml_mulmat_tune_time *profile_time); -const char *ggml_task_backend_name(enum ggml_task_backend backend); - -int ggml_mulmat_tune_get_builtin_task_backends( - enum ggml_task_backend *backends); - bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params); diff --git a/ggml.c b/ggml.c index 43ec93a64..62750b20b 
100644 --- a/ggml.c +++ b/ggml.c @@ -8500,15 +8500,6 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; -#ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; - } -#endif - const int64_t nr = ggml_nrows(src0); const int64_t ne00 = src0->ne[0]; @@ -9933,6 +9924,168 @@ static void ggml_compute_forward_rms_norm_back( } } +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +static void ggml_compute_forward_mul_mat_blas( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + struct ggml_tensor * src0 = dst->src0; + struct ggml_tensor * src1 = dst->src1; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + // const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + const int ith = params->ith; + const int nth = params->nth; + + if (src0->type == GGML_TYPE_F32) { + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else if (src0->type == GGML_TYPE_F16) { + // TODO: we don't support permuted src0 + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + float * const wdata = params->wdata; + { + size_t id = 0; + for (int64_t i01 = 0; i01 < ne01; ++i01) { + for (int64_t i00 = 0; i00 < ne00; ++i00) { + wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); + } + } + + assert(id*sizeof(float) <= params->wsize); + } + + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + cblas_sgemm(CblasRowMajor, CblasNoTrans, 
CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else if (ggml_is_quantized(src0->type)) { + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[src0->type]); + GGML_ASSERT(src0->data); + GGML_ASSERT(params->wdata); + + float * const wdata = params->wdata; + dequantize_row_q_t const dequantize_row_q = quantize_fns[src0->type].dequantize_row_q; + + if (params->type == GGML_TASK_INIT) { + // rows per thread + const int dr = (ne01 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + int ir1 = MIN(ir0 + dr, ne01); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; + float * wdata_offset = wdata + i03*ne03 + i02*ne02; + for (int64_t i = ir0; i < ir1; ++i) { + dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); + } + } + } + return; + } + + GGML_ASSERT(nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + // zT = y * xT + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else { + GGML_ASSERT(false); + } +} +#endif + // CPU only static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, @@ -9947,9 +10100,6 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - const int64_t ne10 = src1->ne[0]; -#endif const int64_t ne11 = src1->ne[1]; #ifndef NDEBUG const int64_t ne12 = src1->ne[2]; @@ -10004,37 +10154,7 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; -#else - GGML_ASSERT(false); -#endif - } - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); // parallelize by src0 rows using ggml_vec_dot_f32 @@ -10152,57 +10272,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - GGML_ASSERT(comp_backend & 
GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT(nb10 == sizeof(float)); - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - float * const wdata = params->wdata; - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } - } - - assert(id*sizeof(float) <= params->wsize); - } - - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ - - return; -#else - GGML_ASSERT(false); -#endif - } - - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); - if (params->type == GGML_TASK_INIT) { ggml_fp16_t * const wdata = params->wdata; @@ -10348,68 +10417,6 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT (init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(src0->data); - GGML_ASSERT(params->wdata); - - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - - if (params->type == GGML_TASK_INIT) { - // rows per thread - const int dr = (ne01 + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - int ir1 = MIN(ir0 + dr, ne01); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; - float * wdata_offset = wdata + i03*ne03 + i02*ne02; - for (int64_t i = ir0; i < ir1; ++i) { - dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); - } - } - } - return; - } - - GGML_ASSERT(nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - return; -#else - GGML_ASSERT(false); -#endif - } - - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); - if (params->type 
== GGML_TASK_INIT) { GGML_ASSERT(params->nth == 1); @@ -14257,6 +14264,7 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// +// CPU only: no BLAS. static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); @@ -15459,96 +15467,163 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- -// TODO: replace with ggml_compute_forward_cuda +// Check the type and memeory layout for mul_mat on blas(CPU BLAS) +static bool ggml_mul_mat_check_type_mem(struct ggml_tensor *tensor) { + enum ggml_type src0_t = tensor->src0->type; + enum ggml_type src1_t = tensor->src1->type; + + // This is the minimal requirement to run mulmat with BLAS. + // Don't check matrix size because that would break tuning. + return (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && tensor->type == GGML_TYPE_F32 && + ggml_is_contiguous(tensor->src0) && ggml_is_contiguous(tensor->src1); +} + // DO NOT check matrix size further. #if defined(GGML_USE_CUBLAS) -static enum ggml_compute_error ggml_compute_forward_cuda( - const struct ggml_compute_params * params, - struct ggml_tensor * tensor) { - GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)); +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_cuda(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + if (tensor->op == GGML_OP_MUL_MAT) { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + } + if (ggml_cuda_compute_forward(params, tensor)) { return GGML_COMPUTE_OK; } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || + tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; } -#endif +#endif // GGML_USE_CUBLAS -// TODO: replace with ggml_cl_mul_mat. -// DO NOT check matrix size further. #if defined(GGML_USE_CLBLAST) -static enum ggml_compute_error ggml_compute_forward_cl( - const struct ggml_compute_params * params, - struct ggml_tensor * tensor) { +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_cl(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { switch (tensor->op) { - case GGML_OP_MUL_MAT: - GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor)); - ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); - return GGML_COMPUTE_OK; - default: - break; + case GGML_OP_MUL: { + if (tensor->src1 && ggml_cl_is_gpu_offloading(tensor)) { + if (params->ith == 0) { + ggml_cl_mul(tensor->src0, tensor->src1, tensor); + return GGML_COMPUTE_OK; + } + } + } break; + case GGML_OP_MUL_MAT: { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, + params->wsize); + return GGML_COMPUTE_OK; + } break; + default: { + } break; } - GGML_ASSERT(false); + return GGML_COMPUTE_FALLBACK; } -static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) { - switch (tensor->op) { - case GGML_OP_MUL_MAT: - return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); - default: - break; - } - return -1; +// Implements ggml_task_wsize_getter. 
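The CUDA and CL wrappers above both follow the same contract: a task runner either handles the tensor and returns `GGML_COMPUTE_OK`, or returns `GGML_COMPUTE_FALLBACK` so that the threading layer (see the `ggml-threading.c` hunk earlier in this diff) can reset the node to the CPU profile and retry. A self-contained toy sketch of that flow, with stand-in names rather than the real ggml types:

```c
// Toy sketch of the OK / FALLBACK runner contract. Names here are stand-ins;
// in ggml the corresponding types are ggml_compute_error and ggml_task_runner.
#include <stdio.h>

enum compute_error { COMPUTE_OK, COMPUTE_FALLBACK };

typedef enum compute_error (runner_fn)(int op);

static enum compute_error backend_runner(int op) {
    // Pretend this backend only implements op 0 ("mul_mat").
    return op == 0 ? COMPUTE_OK : COMPUTE_FALLBACK;
}

static enum compute_error cpu_runner(int op) {
    (void) op;
    return COMPUTE_OK; // the CPU profile must always succeed
}

static void compute(runner_fn *selected, int op) {
    enum compute_error err = selected(op);
    const char *who = "backend";
    if (err == COMPUTE_FALLBACK) {
        // Same retry the main thread performs via `goto START` with profiles[0].
        err = cpu_runner(op);
        who = "cpu (fallback)";
    }
    printf("op %d handled by %s: %s\n", op, who,
           err == COMPUTE_OK ? "ok" : "error");
}

int main(void) {
    compute(backend_runner, 0); // handled by the backend
    compute(backend_runner, 1); // falls back to the CPU profile
    return 0;
}
```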
+static int ggml_compute_forward_cl_get_wsize(struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); + default: + break; + } + return -1; } -#endif +#endif // GGML_USE_CLBLAST -// The wrapper for external mulmat tune tool. -enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params, - struct ggml_tensor *tensor) { - // We call ggml_compute_forward because the CUDA mul_mat entry point - // was moved out of `ggml_compute_forward_mul_mat`. - return ggml_compute_forward(params, tensor); +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_blas(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + ggml_compute_forward_mul_mat_blas(params, tensor); + return GGML_COMPUTE_OK; + } break; + default: { + } break; + } + + return GGML_COMPUTE_FALLBACK; } +// Implements ggml_task_wsize_getter. +static int ggml_compute_forward_blas_get_wsize(struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: { + GGML_ASSERT(tensor->src1->type == GGML_TYPE_F32); + enum ggml_type src0_t = tensor->src0->type; + + if (src0_t == GGML_TYPE_F16) { + return GGML_TYPE_SIZE[GGML_TYPE_F32] * + (tensor->src0->ne[0] * tensor->src0->ne[1]); + } else if (src0_t == GGML_TYPE_F32) { + return 0; + } else if (ggml_is_quantized(src0_t)) { + return GGML_TYPE_SIZE[GGML_TYPE_F32] * + (tensor->src0->ne[0] * tensor->src0->ne[1]); + } else { + GGML_ASSERT(false); + } + } break; + default: + break; + } + return -1; +} +#endif // GGML_USE_ACCELERATE | GGML_USE_OPENBLAS + // Implement `ggml_task_profiles_provider`. -// Fill `profiles` for the `node` and return number of profiles. +// Fill `profiles` for the `tensor` and return number of profiles. // -// NOTE: the node may be incompleted from testing or tunning, so please assert +// NOTE: the tensor may be incompleted from testing or tunning, so please assert // everything used here. // -// TODO: configure cuda for none mul_mat nodes. +// First profile is always CPU, followed by BLAS, CUDA/CL. 
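All BLAS branches consolidated into `ggml_compute_forward_mul_mat_blas` earlier in this diff end in the same row-major sgemm call, computing `zT = y * xT` (that is, `dst = src1 · src0ᵀ`) one 2-D plane at a time. A standalone demo of just that call pattern, with small illustrative shapes (assumes a CBLAS provider such as OpenBLAS is linked; on macOS include Accelerate instead):

```c
// Build e.g.: cc sgemm_demo.c -lopenblas
#include <stdio.h>
#include <cblas.h>

int main(void) {
    enum { ne00 = 3, ne01 = 2, ne10 = 3, ne11 = 2 };

    const float x[ne01 * ne00] = { 1, 2, 3,
                                   4, 5, 6 };   // src0: ne01 x ne00 (2 x 3)
    const float y[ne11 * ne10] = { 1, 0, 1,
                                   0, 1, 0 };   // src1: ne11 x ne10 (2 x 3)
    float d[ne11 * ne01] = { 0 };               // dst:  ne11 x ne01 (2 x 2)

    // zT = y * xT -- same argument order as in ggml_compute_forward_mul_mat_blas.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne11, ne01, ne10,
                1.0f, y, ne10,
                      x, ne00,
                0.0f, d, ne01);

    for (int i = 0; i < ne11; i++) {
        printf("%6.1f %6.1f\n", d[i*ne01 + 0], d[i*ne01 + 1]);
    }
    // Expected output:
    //    4.0   10.0   (row 0 of y dotted with the rows of x)
    //    2.0    5.0
    return 0;
}
```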
int ggml_get_task_profiles( - struct ggml_tensor *node, + struct ggml_tensor *tensor, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { - GGML_ASSERT(node); - GGML_ASSERT(node->op >= 0); + + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->op >= 0); GGML_ASSERT(profiles); memset(profiles, 0, sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES); struct ggml_task_profile *p = profiles; - int n_profiles = 0; - switch (node->op) { + int n_profiles = 1; + strcpy(p[0].name, "CPU"); + p[0].runner = ggml_compute_forward; + // p[0].wsize_getter = ...; + + switch (tensor->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_ADD: case GGML_OP_ADD1: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_ACC: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_SUB: case GGML_OP_DIV: @@ -15565,13 +15640,11 @@ int ggml_get_task_profiles( case GGML_OP_NEG: case GGML_OP_STEP: case GGML_OP_RELU: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_MUL: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_GELU: case GGML_OP_SILU: @@ -15579,69 +15652,32 @@ int ggml_get_task_profiles( case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: { - // CPU only profiles. - // CUDA/CL: see end of function. - GGML_ASSERT(node->src0); - GGML_ASSERT(node->src1); - - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; - - GGML_ASSERT(src1_t == GGML_TYPE_F32); - - int i = 0; + case GGML_OP_OUT_PROD: { // FIXME: is this correct? 
+ enum ggml_type src0_t = tensor->src0->type; if (src0_t == GGML_TYPE_F32) { - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } else if (src0_t == GGML_TYPE_F16) { - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } else if (ggml_is_quantized(src0_t)) { - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[0].parallel = true; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } - n_profiles = i; } break; case GGML_OP_SCALE: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_SET: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: @@ -15652,64 +15688,53 @@ int ggml_get_task_profiles( case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_ALIBI: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_CLAMP: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_FLASH_ATTN: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_FLASH_FF: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } case GGML_OP_FLASH_ATTN_BACK: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + 
p[0].stages[1].valid = true; } break; case GGML_OP_CROSS_ENTROPY_LOSS: - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - p[0].stages[2].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[2].valid = true; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_NONE: case GGML_OP_COUNT: { @@ -15719,227 +15744,196 @@ int ggml_get_task_profiles( GGML_ASSERT(false); } -#if defined(GGML_USE_CUBLAS) - switch (node->op) { - case GGML_OP_ADD: - case GGML_OP_MUL: - case GGML_OP_SILU: - case GGML_OP_RMS_NORM: - case GGML_OP_MUL_MAT: - case GGML_OP_RESHAPE: - case GGML_OP_ROPE: { - int i = n_profiles; - p[i].runner = ggml_compute_forward_cuda; - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (tensor->op == GGML_OP_MUL_MAT) { + enum ggml_type src0_t = tensor->src0->type; + int i = n_profiles; + + strcpy(p[i].name, "BLAS"); + p[i].runner = ggml_compute_forward_blas; + p[i].wsize_getter = ggml_compute_forward_blas_get_wsize; + + if (src0_t == GGML_TYPE_F32) { + p[i].stages[1].valid = true; p[i].stages[1].wait = true; - ++n_profiles; - } break; - default: { - } break; + } else if (src0_t == GGML_TYPE_F16) { + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + } else if (ggml_is_quantized(src0_t)) { + p[i].stages[0].valid = true; + p[i].stages[0].parallel = true; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + } + ++n_profiles; + } +#endif + +#if defined(GGML_USE_CUBLAS) + if (true) { // FIXME: filter supported op to avoid unnecceary fallback. + int i = n_profiles; + strcpy(p[i].name, "CUDA"); + p[i].runner = ggml_compute_forward_cuda; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + ++n_profiles; } #elif defined(GGML_USE_CLBLAST) - switch (node->op) { - case GGML_OP_MUL_MAT: { - int i = n_profiles; - p[i].runner = ggml_compute_forward_cl; - p[i].get_wsize = ggml_compute_forward_get_wsize_cl; - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - ++n_profiles; - } break; - default: { - } break; + if (tensor->op == GGML_OP_MUL || tensor->op == GGML_OP_MUL_MAT) { + int i = n_profiles; + strcpy(p[i].name, "CL"); + p[i].runner = ggml_compute_forward_cl; + p[i].wsize_getter = ggml_compute_forward_cl_get_wsize; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + ++n_profiles; } #endif GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); - for (int i = 0; i < n_profiles; i++) { - profiles[i].id = i + 1; + + for (int j = 0; j < n_profiles; j++) { + profiles[j].id = j + 1; } + return n_profiles; } -// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. -static const struct ggml_task_profile *ggml_mulmat_get_task_profile( - struct ggml_tensor *node, struct ggml_task_profile *profiles, - int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) { +// Try to fix task profile for given tensor, because the task profile might not +// be the most performant. 
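Before the implementation that follows below, here is a deliberately simplified, standalone sketch of the policy it applies: walk the profiles from the highest id down and keep a non-CPU profile only when the matrices look large enough (M, N, K >= 32) or the data is already offloaded to the GPU, otherwise settle on profile 0 (CPU). Every name in the sketch is a stand-in; the real function additionally consults the tune table and applies per-backend type/layout checks:

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct mini_profile { int id; char name[16]; };

// Hypothetical helper mirroring the size heuristic used in the patch.
static bool size_match(int M, int N, int K) {
    return M >= 32 && N >= 32 && K >= 32;
}

static const struct mini_profile *
pick_profile(const struct mini_profile *p, int n, int M, int N, int K,
             bool gpu_offloading) {
    for (int i = n - 1; i >= 0; --i) {
        if (strcmp(p[i].name, "CPU") == 0) {
            continue;
        }
        if (size_match(M, N, K) || gpu_offloading) {
            return &p[i];        // highest-id non-CPU profile that qualifies
        }
    }
    return &p[0];                // default: CPU
}

int main(void) {
    const struct mini_profile profs[] = { {1, "CPU"}, {2, "BLAS"}, {3, "CL"} };

    // Decode-sized matmul (M = 1): stays on CPU.
    printf("%s\n", pick_profile(profs, 3, 1,   4096, 4096, false)->name); // CPU
    // Prompt-sized matmul: picks the last non-CPU profile.
    printf("%s\n", pick_profile(profs, 3, 512, 4096, 4096, false)->name); // CL
    return 0;
}
```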
+static void ggml_optimize_tensor_task_profile( + struct ggml_tensor *tensor, struct ggml_task_profile *profiles, + int n_profiles, struct ggml_mulmat_tune *tune) { + + if (tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_OUT_PROD) { + return; + } + + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->op == GGML_OP_MUL_MAT || + tensor->op == GGML_OP_OUT_PROD); + GGML_ASSERT(tensor->task_profile.id == n_profiles); - GGML_ASSERT(node); - GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(profiles); - GGML_ASSERT(n_profiles > 0); + GGML_ASSERT(n_profiles > 1); - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; + int M = (int)tensor->ne[1]; + int N = (int)tensor->ne[0]; + int K = (int)tensor->src1->ne[0]; - // Type and memory layout requirements for computing mul_mat with BLAS. - bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || - ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); - - int M = (int)node->ne[1]; - int N = (int)node->ne[0]; - int K = (int)node->src1->ne[0]; - - const struct ggml_task_profile *prof = NULL; - - if (cond_match) { #if defined(GGML_USE_TUNE) - if (tune != NULL) { - GGML_ASSERT(n_profiles >= 2); - int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, - src1_t, stages_time_us); + if (tune != NULL && ggml_mul_mat_check_type_mem(tensor)) { + GGML_ASSERT(tensor->backend == 0 && tensor->src0->backend == 0 && + tensor->src1->backend == 0); + + GGML_ASSERT(n_profiles >= 2); + + enum ggml_type src0_t = tensor->src0->type; + enum ggml_type src1_t = tensor->src1->type; + + int stages_time_us[3]; + + int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + src1_t, stages_time_us); + if (id > 0) { + struct ggml_task_profile *prof = NULL; + for (int i = 0; i < n_profiles; i++) { if (profiles[i].id == id) { prof = &profiles[i]; - return prof; - } - } - } -#else - UNUSED(tune); - UNUSED(stages_time_us); -#endif - - if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { - for (int j = 0; j < n_profiles; j++) { - enum ggml_task_backend comp_be = - profiles[j].stages[GGML_TASK_COMPUTE].backend; - switch (comp_be) { - case GGML_TASK_BACKEND_GPU_CUDA: { - GGML_ASSERT(ggml_cpu_has_cublas()); - prof = &profiles[j]; - break; - } - case GGML_TASK_BACKEND_GPU_CL: { - GGML_ASSERT(ggml_cpu_has_clblast()); - prof = &profiles[j]; - break; - } - case GGML_TASK_BACKEND_CPU_BLAS: { - GGML_ASSERT(ggml_cpu_has_cpublas()); - prof = &profiles[j]; - break; - } - default: { - break; - } - } - - if (prof) { break; } } - } - } - if (prof == NULL) { - prof = &profiles[0]; - GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU); - } - - return prof; -} - -void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node, - struct ggml_cgraph *cgraph) { - // Pre-specified. - for (int i = 0; i < 3; i++) { - if (node->task_profile.stages[i].backend > 0) { - return; - } - } - - struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; - int n_profiles = ggml_get_task_profiles(node, profiles); - - const struct ggml_task_profile *profile = NULL; - - // GPU offloading. A special case of pre-specified task_profile. 
- if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) { - if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) { - enum ggml_task_backend be; - if (ggml_cpu_has_cublas()) { - be = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - be = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); - } - - for (int j = 0; j < n_profiles; j++) { - if (profiles[j].stages[1].backend == be) { - profile = &profiles[j]; - break; - } - } - GGML_ASSERT(profile); - GGML_ASSERT(!cgraph->tune); - - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; - } - } - - // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD. - if (node->op == GGML_OP_MUL_MAT) { -#if defined(GGML_USE_TUNE) - GGML_ASSERT(node->backend == GGML_BACKEND_CPU); - - int stages_time_us[3]; - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, - cgraph->tune, stages_time_us); - GGML_ASSERT(profile); - - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - - if (cgraph->tune) { - memcpy(&node->task_profile, profile, + if (prof) { + memcpy(&tensor->task_profile, prof, sizeof(struct ggml_task_profile)); - // Do not wait if the estimated execution time is too small - // (e.g. less than 0.1 ms) - // TODO: need bench actual wait/notify time, see - // ggml-threading.c - for (int j = 0; j < 3; j++) { - if (node->task_profile.stages[j].wait) { - if (stages_time_us[j] < 100) { - node->task_profile.stages[j].wait = false; + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j < 3; j++) { + if (tensor->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + tensor->task_profile.stages[j].wait = false; + } } } + return; } } - return; + } #else - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, - NULL); - GGML_ASSERT(profile); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; + UNUSED(tune); #endif - } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct? - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, - NULL); - GGML_ASSERT(profile); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; + + // Guess the optimal matrix size. + bool size_match = (M >= 32 && N >= 32 && K >= 32); + UNUSED(size_match); + + for (int i = n_profiles - 1; i >= 0; --i) { + const char *name = profiles[i].name; + + if (strcmp(name, "CUDA") == 0) { +#if defined(GGML_USE_CUBLAS) + if ((size_match || ggml_cuda_is_gpu_offloading(tensor)) && + ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[i], + sizeof(struct ggml_task_profile)); + return; + } +#endif + } + + if (strcmp(name, "CL") == 0) { +#if defined(GGML_USE_CLBLAST) + if ((size_match || ggml_cl_is_gpu_offloading(tensor)) && + ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[i], + sizeof(struct ggml_task_profile)); + return; + } +#endif + } + + if (strcmp(name, "BLAS") == 0) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (size_match && ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); + return; + } +#endif + } } - // default. 
- profile = &profiles[0]; - GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + memcpy(&tensor->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); +} + +static void ggml_set_tensor_task_profile(struct ggml_tensor *tensor, + struct ggml_mulmat_tune *tune) { + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(tensor, profiles); + GGML_ASSERT(n_profiles > 0); + + // By default use profile with the largest id. + // Profile id starts from 1. + memcpy(&tensor->task_profile, &profiles[n_profiles - 1], + sizeof(struct ggml_task_profile)); + + if (n_profiles > 1) { + GGML_ASSERT(tensor->task_profile.id > 1); + ggml_optimize_tensor_task_profile(tensor, profiles, n_profiles, tune); + } + + GGML_ASSERT(tensor->task_profile.id > 0); } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, NULL, ggml_compute_forward, - GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); - // initialize tasks + work buffer { // int64_t t0 = ggml_time_us(); @@ -15952,25 +15946,26 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_ASSERT (node->op != GGML_OP_NONE); - struct ggml_task_stage *stages = node->task_profile.stages; + if (node->task_profile.id == 0) { + ggml_set_tensor_task_profile(node, cgraph->tune); + } - ggml_graph_compute_set_tensor_task_proile(node, cgraph); + struct ggml_task_stage *stages = node->task_profile.stages; // // Allocate temp buffer `wdata` for CPU. // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. // - if (node->task_profile.get_wsize) { - int sz = node->task_profile.get_wsize(node); + if (node->task_profile.wsize_getter) { + int sz = node->task_profile.wsize_getter(node); if (sz >= 0) { work_size = MAX(work_size, (size_t)sz); + // FIXME: is it safe to continue in case fallback? continue; } } - //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend); - // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; @@ -16034,35 +16029,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_OUT_PROD: // FIXME: is this correct? 
{ size_t cur = 0; - enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { - GGML_ASSERT(ggml_cpu_has_cpublas()); - GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); - if (node->src0->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_F16) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - GGML_ASSERT(false); - } - } else { // CPU or GPU fallback - GGML_ASSERT(node->src1->type == GGML_TYPE_F32); - - if (node->src0->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_F16) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } else if (ggml_is_quantized(node->src0->type)) { - const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; - cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; - } else { - GGML_ASSERT(false); - } + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + } else if (ggml_is_quantized(node->src0->type)) { + const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; + cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + } else { + GGML_ASSERT(false); } work_size = MAX(work_size, cur); @@ -16218,6 +16195,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); + struct ggml_threading_context *thrd_ctx = ggml_threading_start(n_threads, + NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + for (int i = 0; i < cgraph->n_nodes; i++) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); diff --git a/ggml.h b/ggml.h index d4d5d3521..554645ba8 100644 --- a/ggml.h +++ b/ggml.h @@ -362,29 +362,10 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // As part of task config profile solution, `ggml_task_backend` defines - // backends for each task stage. Similar to `ggml_tensor.backend`, - // `ggml_tensor.task_profile` generalizes how to configure tensor computing - // at per task-stage level. - // - // The following enum values are designed as combination of hardware and - // optional software interface. - enum ggml_task_backend { - GGML_TASK_BACKEND_NONE = 0, - - // [0x10, 0x1F]: CPU - GGML_TASK_BACKEND_CPU = 0x10, - GGML_TASK_BACKEND_CPU_BLAS = 0x11, - - // [0x20 - 0x2F]: GPU - GGML_TASK_BACKEND_GPU = 0x20, - GGML_TASK_BACKEND_GPU_CUDA = 0x21, - GGML_TASK_BACKEND_GPU_CL = 0x22, - }; - // config for computing one of the 3 task stages of a tensor. struct ggml_task_stage { - enum ggml_task_backend backend; + bool valid; + bool parallel; // hint idle workers go waiting, valid only when parallel is false. bool wait; @@ -407,13 +388,16 @@ extern "C" { // Get wsize for node computing. // When return -1: should be explained as `fallback to CPU`, caller MUST // determine how much memory to reserve for this node. 
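The comment just above states the contract that `ggml_task_wsize_getter` implementations must honour: a result >= 0 is the exact wdata size for the selected profile, while -1 means the getter cannot tell and the caller has to budget its own (CPU-path) estimate, which is what `ggml_graph_compute` does in the hunk above. A minimal caller-side sketch of that reading of the contract, with stand-in types only:

```c
#include <stddef.h>
#include <stdio.h>

// Stand-in for ggml_task_wsize_getter; the real one takes a ggml_tensor *.
typedef int (wsize_getter_fn)(const void *node);

static size_t node_wsize(wsize_getter_fn *getter, const void *node,
                         size_t cpu_estimate) {
    if (getter) {
        int sz = getter(node);
        if (sz >= 0) {
            return (size_t) sz;   // profile knows exactly what it needs
        }
    }
    return cpu_estimate;          // -1 or no getter: reserve the CPU estimate
}

static int getter_unknown(const void *node) { (void) node; return -1; }
static int getter_64k(const void *node)     { (void) node; return 64 * 1024; }

int main(void) {
    printf("%zu\n", node_wsize(getter_64k,     NULL, (size_t) 1 << 20)); // 65536
    printf("%zu\n", node_wsize(getter_unknown, NULL, (size_t) 1 << 20)); // 1048576
    return 0;
}
```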
-    typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor);
+    typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor);

     // config for computing a tensor.
     struct ggml_task_profile {
         // profile id, start from 1.
         int id;

+        // Required: not empty, no whitespace.
+        char name[16];
+
         // index 0: INIT, 1: COMPUTE, 2: FINALIZE
         struct ggml_task_stage stages[3];
@@ -421,7 +405,7 @@ extern "C" {
         ggml_task_runner *runner;

         // Optional function to return required wsize for wdata.
-        ggml_task_get_wsize *get_wsize;
+        ggml_task_wsize_getter *wsize_getter;

         // Optional flag for development.
         // MUST be used only in testing codes.
diff --git a/llama.cpp b/llama.cpp
index 06555e1dd..e6bddffd5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2744,8 +2744,9 @@ struct llama_context * llama_init_from_file(
 }

 #ifdef GGML_USE_TUNE
-bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
-    GGML_ASSERT (ctx->model.n_gpu_layers == 0);
+bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune,
+                       const char *fname) {
+    GGML_ASSERT(ctx->model.n_gpu_layers == 0);

     printf("\n");
@@ -2755,7 +2756,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
     enum ggml_ftype ggml_ftype;
     switch (hparams->ftype) {
-    case LLAMA_FTYPE_ALL_F32:
+    case LLAMA_FTYPE_ALL_F32:
         ggml_ftype = GGML_FTYPE_ALL_F32;
         break;
     case LLAMA_FTYPE_MOSTLY_F16:
@@ -2767,9 +2768,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
     case LLAMA_FTYPE_MOSTLY_Q4_1:
         ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
         break;
-    case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-        ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
-        break;
     case LLAMA_FTYPE_MOSTLY_Q5_0:
         ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
         break;
@@ -2799,8 +2797,8 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
         ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
         break;
     default:
-        throw std::runtime_error(
-            format("invalid output file type %d\n", hparams->ftype));
+        fprintf(stderr, "[tune] unsupported file type %d\n", hparams->ftype);
+        return false;
     }

     int n_vocab = hparams->n_vocab;
@@ -2808,30 +2806,36 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
     int n_rot = hparams->n_rot;
     int n_mult = hparams->n_mult;

-    int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
+    int n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;

     struct ggml_mulmat_tune_params params = {
-        /*.model =*/ {
-            /* .name =*/ model_name,
-            /* .ftype =*/ ggml_ftype,
-            /* .n_vocab =*/ n_vocab,
-            /* .n_embd =*/ n_embd,
-            /* .n_ff =*/ n_ff,
-            /* .n_rot =*/ n_rot,
+        /*.model =*/{
+            /* .name =*/model_name,
+            /* .ftype =*/ggml_ftype,
+            /* .n_vocab =*/n_vocab,
+            /* .n_embd =*/n_embd,
+            /* .n_ff =*/n_ff,
+            /* .n_rot =*/n_rot,
         },
-        /* .m_num =*/ 8,
-        /* .n_pass =*/ 1,
-        /* .n_threads =*/ n_threads,
-        /* .prrogress =*/ true,
-        /* .output_console =*/ false,
-        /* .fname =*/ fname,
+        /* .m_num =*/8,
+        /* .n_pass =*/1,
+        /* .n_threads =*/n_threads,
+        /* .prrogress =*/true,
+        /* .output_console =*/false,
+        /* .fname =*/fname,
     };

     bool empty_fname = !fname || strcmp(fname, "") == 0;

-    ctx->tune = new(struct ggml_mulmat_tune);
+    ctx->tune = new (struct ggml_mulmat_tune);
     if (!ctx->tune) {
-        throw std::runtime_error(format("failed to allocate memory for tune\n"));
+        fprintf(stderr, "[tune] failed to allocate memory for tune\n");
+        return false;
+    }
+
+    if (!ggml_cpu_has_blas()) {
+        fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n");
+        return false;
     }

     if (tune) {
@@ -2844,31 +2848,30 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
             ggml_mulmat_tune_free(ctx->tune);
             return true;
         }
-    } else {
-        if (empty_fname) {
-            return false;
-        }
+    } else if (empty_fname) {
+        return false;
     }

     if (!empty_fname) {
         FILE *fp = fopen(fname, "r");
         if (!fp) {
-            fprintf(stderr, "[tune] failed to open file %s.\n",
-                    fname);
+            fprintf(stderr, "[tune] failed to open file %s.\n", fname);
+            return false;
         } else {
-            bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
+            int rc = ggml_mulmat_tune_read_data(ctx->tune, fp);
             fclose(fp);

-            if (!ok) {
+            if (rc != 0) {
                 fprintf(stderr,
-                        "[tune] failed to read data from %s\n",
-                        fname);
+                        "[tune] failed to read data from %s, error code: %d\n",
+                        fname, rc);
                 return false;
             }

             fprintf(stderr, "[tune] loaded data from %s\n", fname);

-            ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads);
+            bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype,
+                                                params.n_threads);
             if (!ok) {
                 return false;
             }
diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c
index 2079fe144..e904fc10d 100644
--- a/tests/test-ggml-threading.c
+++ b/tests/test-ggml-threading.c
@@ -41,9 +41,8 @@ static const int n_repeat = 10;
 // counter with array.
 static int work_done_arr[MAX_N_THREADS];

-static enum ggml_compute_error
-mock_task_runner(const struct ggml_compute_params *params,
-                 struct ggml_tensor *node) {
+static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params,
+                                                struct ggml_tensor *node) {
     int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
     if (node->task_profile.stages[params->type].parallel) {
         loops /= params->nth;
@@ -80,20 +79,15 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {

     int t0 = (int)ggml_time_us();

-    struct ggml_threading_context *ctx = ggml_threading_start(
-        n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL);
+    node->task_profile.runner = mock_task_runner;
+
+    struct ggml_threading_context *ctx =
+        ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL);

     int t1 = (int)ggml_time_us();

     for (int i = 0; i < n_repeat; i++) {
-        enum ggml_compute_error err = ggml_threading_compute_tensor(
-            ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
-        if (err != GGML_COMPUTE_OK) {
-            ggml_threading_stop(ctx);
-            printf("ggml_threading_compute_tensor failed with error: %d.\n",
-                   err);
-            return 1;
-        }
+        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
     }

     int t2 = (int)ggml_time_us();
@@ -107,7 +101,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
     int expect = 0;
     for (int i = 0; i < 3; i++) {
         const struct ggml_task_stage *ts = &stages[i];
-        if (ts->backend != GGML_TASK_BACKEND_NONE) {
+        if (ts->valid) {
             if (ts->parallel) {
                 expect += n_threads;
             } else {
@@ -144,14 +138,12 @@ static enum ggml_compute_error
 mock_task_runner_fallback(const struct ggml_compute_params *params,
                           struct ggml_tensor *node) {
     UNUSED(params);
-    if (node->backend == GGML_BACKEND_GPU) {
-        // ... finally failed to compute in GPU.
-        node->backend = GGML_BACKEND_CPU;
+    // failed to run ...
+    if (node->task_profile.id == 2) {
         return GGML_COMPUTE_FALLBACK;
-    } else {
-        return GGML_COMPUTE_OK;
     }
+    return GGML_COMPUTE_OK;
 }

 // By design, fallback should happen when attempt computing tensor in GPU,
@@ -164,6 +156,9 @@ int test_fallback(struct ggml_tensor *node) {
     enum ggml_compute_error err =
         ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
     if (err == GGML_COMPUTE_FALLBACK) {
+        // mock: set up a new profile ...
+        node->task_profile.id = 1;
+
         err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
                                             /*wsize*/ 0);
     }
@@ -214,12 +209,12 @@ int main(void) {

     struct ggml_tensor node;
     memset(&node, 0, sizeof(struct ggml_tensor));
+    node.task_profile.runner = mock_task_runner;

     struct ggml_task_stage *stages = node.task_profile.stages;

-    stages[0].backend = GGML_TASK_BACKEND_CPU;
-    stages[1].backend = GGML_TASK_BACKEND_CPU;
-    stages[2].backend = GGML_TASK_BACKEND_NONE;
+    stages[0].valid = true;
+    stages[1].valid = true;

     int n_passed = 0;
     int n_tests = 0;
@@ -277,7 +272,7 @@ int main(void) {

         struct ggml_threading_context *ctx =
             ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
-                                 mock_task_runner, 0, /*stages_time*/ NULL);
+                                 NULL, 0, /*stages_time*/ NULL);

         int t1 = (int)ggml_time_us();
@@ -416,8 +411,8 @@ int main(void) {
     node.src0 = &src0;
     node.src1 = &src1;

-    node.backend = GGML_BACKEND_GPU;
-    stages[1].backend = GGML_TASK_BACKEND_GPU;
+    node.task_profile.id = 2;
+    stages[1].valid = true;

     if (test_fallback(&node) == 0) {
         ++n_passed;
         printf("[test-ggml-threading] test fallback: ok\n\n");
diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c
index 4339881e5..97fd6cfbf 100644
--- a/tests/test-ggml-tune.c
+++ b/tests/test-ggml-tune.c
@@ -46,13 +46,9 @@ int main(void) {
 }

 static int bench(void) {
-    {
-        enum ggml_task_backend backends[16];
-        int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
-        if (n_backends < 2) {
-            printf("[test-ggml-tune] skipped because no BLAS\n");
-            return 0;
-        }
+    if (!ggml_cpu_has_blas()) {
+        printf("[test-ggml-tune] skipped because no BLAS\n");
+        return 0;
     }

     {
@@ -118,10 +114,13 @@ static int
 ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
                                      struct ggml_task_profile *profiles) {
     UNUSED(node);
-    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
-    profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
-    profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
-    profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
+    profiles[0].id = 1;
+    profiles[0].stages[0].valid = true;
+    profiles[0].stages[1].valid = true;
+
+    profiles[1].id = 2;
+    profiles[1].stages[0].valid = true;
+    profiles[1].stages[1].valid = true;

     return 2;
 }
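
The `ggml.h` changes above replace the per-stage backend enum with a small `valid`/`parallel`/`wait` bitmap plus a profile `id`, `name`, `runner`, and `wsize_getter`, and the corrected provider in `tests/test-ggml-tune.c` shows how such profiles are expected to be filled in. The sketch below is not part of the patch: it is a self-contained toy in which `task_stage`, `task_profile`, `MAX_TASK_PROFILES`, and `mock_profiles_provider` are local stand-ins (only the field names are copied from this diff), used to illustrate how a provider fills the profile array and how a caller picks the highest-id profile by default, as `ggml_set_tensor_task_profile` does in `ggml.c`.

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

// Local stand-ins that mirror the fields added to ggml.h by this patch.
// Only the field names come from the diff; sizes and helpers are assumed.
struct task_stage {
    bool valid;     // stage participates in computation
    bool parallel;  // run the stage with all worker threads
    bool wait;      // hint idle workers to wait (only when parallel is false)
};

struct task_profile {
    int  id;                      // starts from 1; 0 means "not configured yet"
    char name[16];                // required: not empty, no whitespace
    struct task_stage stages[3];  // 0: INIT, 1: COMPUTE, 2: FINALIZE
};

#define MAX_TASK_PROFILES 8

// Shaped like ggml_task_profiles_mock_qxx_provider() in tests/test-ggml-tune.c:
// profile 1 is plain CPU, profile 2 stands in for a CPU+BLAS configuration.
static int mock_profiles_provider(struct task_profile *profiles) {
    memset(profiles, 0, sizeof(struct task_profile) * MAX_TASK_PROFILES);

    profiles[0].id = 1;
    snprintf(profiles[0].name, sizeof(profiles[0].name), "CPU");
    profiles[0].stages[0].valid    = true;  // INIT
    profiles[0].stages[1].valid    = true;  // COMPUTE
    profiles[0].stages[1].parallel = true;

    profiles[1].id = 2;  // note: index 1, as in the corrected provider above
    snprintf(profiles[1].name, sizeof(profiles[1].name), "BLAS");
    profiles[1].stages[0].valid = true;
    profiles[1].stages[1].valid = true;
    profiles[1].stages[1].wait  = true;  // single-threaded compute, workers wait

    return 2;
}

int main(void) {
    struct task_profile profiles[MAX_TASK_PROFILES];
    int n_profiles = mock_profiles_provider(profiles);

    // Default selection as in ggml_set_tensor_task_profile(): take the
    // profile with the largest id, i.e. the last one the provider returned.
    const struct task_profile *chosen = &profiles[n_profiles - 1];
    printf("default profile: id=%d name=%s\n", chosen->id, chosen->name);
    return 0;
}
```

In the real patch a tuned setup would then call `ggml_optimize_tensor_task_profile()` with the loaded `ggml_mulmat_tune` data, so the default (largest-id) choice can be demoted to a cheaper profile when tuning shows it does not pay off for the current shape.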
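The reworked fallback test encodes the runtime contract that replaces the old backend checks: a runner may return `GGML_COMPUTE_FALLBACK`, and the caller reacts by switching the tensor to another (lower-id) profile and computing again. The loop below is a hedged sketch of that contract only; the enum values mirror the test, while `compute_with_profile` and the generalized retry loop are illustrative assumptions, not API from this patch (the test itself retries exactly once, from id 2 to id 1).

```c
#include <stdio.h>

// Mirrors the error codes used by tests/test-ggml-threading.c; the names are
// kept local so the sketch stays self-contained.
enum compute_error {
    COMPUTE_OK = 0,
    COMPUTE_FALLBACK,
};

// Hypothetical stand-in for a profile runner: pretend that profile id 2
// (e.g. an offloading/BLAS profile) cannot handle the node and asks for
// fallback, exactly like mock_task_runner_fallback() does in the test.
static enum compute_error compute_with_profile(int profile_id) {
    if (profile_id == 2) {
        return COMPUTE_FALLBACK;
    }
    printf("computed with profile id %d\n", profile_id);
    return COMPUTE_OK;
}

int main(void) {
    // Start from the most capable profile; on fallback, retry with the next
    // lower id, which is what test_fallback() does with ids 2 -> 1.
    int profile_id = 2;
    enum compute_error err = compute_with_profile(profile_id);

    while (err == COMPUTE_FALLBACK && profile_id > 1) {
        --profile_id;
        err = compute_with_profile(profile_id);
    }

    return err == COMPUTE_OK ? 0 : 1;
}
```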