bulk refactoring of task profile and related code to get CL GPU offloading running.

* removed ggml_task_backend in favour of ggml_task_profile.runner and the newly added id and name fields (sketched below).
* extracted the mul_mat BLAS code into ggml_compute_forward_mul_mat_blas,
  which aligns it a bit more with CUDA/CL and makes it easier to fix profiles and run tune.
* rewrote the task profiles and updated/added some CUDA/CL code, finally making CL GPU offloading work.
* misc minor fixes/updates to tune; the data format was changed.
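
For reference, a minimal sketch of the reworked structs, abridged from the ggml.h hunk later in this diff (dev flags and the exact comments elided):

```
// config for computing one of the 3 task stages of a tensor.
struct ggml_task_stage {
    bool valid;    // replaces the removed enum ggml_task_backend
    bool parallel;
    bool wait;     // hint idle workers to wait; meaningful only when parallel is false
};

// config for computing a tensor.
struct ggml_task_profile {
    int  id;                              // profile id, starts from 1
    char name[16];                        // required, not empty, no whitespace
    struct ggml_task_stage stages[3];     // index 0: INIT, 1: COMPUTE, 2: FINALIZE
    ggml_task_runner       *runner;       // per-profile task runner
    ggml_task_wsize_getter *wsize_getter; // optional, returns required wsize for wdata
    // ... dev flags omitted
};
```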
This commit is contained in:
mqy 2023-06-18 12:29:16 +08:00
parent 6b83a3e16f
commit 06b00827a0
15 changed files with 673 additions and 773 deletions

View File

@ -214,25 +214,18 @@ The following results are generated with Accelerate compiled.
**Example** **Example**
``` ```
5 3B 2 6 1 [tune] done, elapsed time: 0 seconds.
10 xB 12 4 2
3200 3200 2 0 3 10 1024 1024 12 0 2 4
16 0 0 0 16 1 0 1 0 0 0 0 100 110 000 1 CPU
16 1 0 2 17 0 1 0 0 0 0 0 110 101 000 2 BLAS
0 0 0 0 34 0 1 0 0 0 0 0 1 11 309 0 1234 90 0
1 1 793 0 9103 2102 0 0 6014 0 2 23 654 0 1359 215 0
2 2 1591 0 8034 2305 0 0 30982 0 4 44 1283 0 1362 421 0
4 4 2236 0 6476 2484 0 0 31388 0 8 85 2341 0 1357 347 0
8 7 4161 0 6623 2389 0 0 29204 0
16 15 8339 0 6434 2752 0 0 34303 0
32 32 16919 0 6915 3651 0 0 42511 0
64 200 34270 0 6574 4528 0 0 68212 0
128 188 69400 0 6325 6839 0 0 74437 0
256 303 134597 0 6168 11544 0 0 110180 0
512 687 279685 0 6337 29712 0 0 159728 0
3200 8640 2 0 2 10
1024 2048 12 0 2 4
... ...
``` ```
@ -249,17 +242,17 @@ shape+
# head # head
version: 1 version: 1
model: "3B" | "7B" | "13B" | "30B" | "65B" model: "3B" | "7B" | "13B" | "30B" | "65B"
ggml_ftype: 0 - 4, 7 - 14 ggml_ftype: 0 - 3, 7 - 14
n_shapes: number of shapes n_shapes: number of shapes
n_threads: number of threads n_threads: number of threads
shape := N K m_num n_profiles shape := N K src0_ggml_type src1_ggml_type n_profiles m_num
task_conf_profile+ task_profile+
bench_item+ bench_item+
task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name
stage_conf: backend parallel wait stage_conf(bitmap): valid parallel wait
backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) valid: 0 (false) | 1 (true)
parallel: 0 (false) | 1 (true) parallel: 0 (false) | 1 (true)
wait: 0 (false) | 1 (true) wait: 0 (false) | 1 (true)
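
For illustration, a minimal sketch of how one `stage_conf` bitmap could be written and parsed under this format. The `stage_conf` struct here is a stand-in for `ggml_task_stage`; the real I/O is in the `%1d%1d%1d` changes to ggml-tune.c further down in this diff.

```
#include <stdbool.h>
#include <stdio.h>

struct stage_conf { bool valid, parallel, wait; };

// e.g. {valid=1, parallel=1, wait=0} is written as "110"
static void stage_conf_write(FILE *fp, const struct stage_conf *s) {
    fprintf(fp, "%1d%1d%1d", s->valid ? 1 : 0, s->parallel ? 1 : 0, s->wait ? 1 : 0);
}

static bool stage_conf_read(FILE *fp, struct stage_conf *s) {
    int v, p, w;
    if (fscanf(fp, " %1d%1d%1d", &v, &p, &w) != 3) {
        return false;
    }
    s->valid    = v != 0;
    s->parallel = p != 0;
    s->wait     = w != 0;
    return true;
}
```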

View File

@ -111,6 +111,11 @@ static void usage(char *prog) {
} }
int main(int argc, char **argv) { int main(int argc, char **argv) {
if (!ggml_cpu_has_blas()) {
fprintf(stderr, "error: this program is not built with BLAS.\n");
return 1;
}
if (argc == 2) { if (argc == 2) {
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
usage(argv[0]); usage(argv[0]);

View File

@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
} }
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
const int64_t ne10 = src1->ne[0]; static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these // TODO: find the optimal values for these
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
return true; return true;
} }
@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() {
g_scratch_buffer = nullptr; g_scratch_buffer = nullptr;
} }
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) {
ggml_cuda_func_t func; GGML_ASSERT(tensor);
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU GGML_ASSERT(tensor->src0);
return tensor->backend == GGML_BACKEND_GPU
|| tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
|| (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
}
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
ggml_cuda_func_t func;
const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor);
switch (tensor->op) { switch (tensor->op) {
case GGML_OP_ADD: case GGML_OP_ADD:
@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
func = ggml_cuda_rms_norm; func = ggml_cuda_rms_norm;
break; break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
return false; return false;
} }
func = ggml_cuda_mul_mat; func = ggml_cuda_mul_mat;

View File

@ -16,7 +16,7 @@ void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split); void ggml_cuda_set_tensor_split(const float * tensor_split);
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

View File

@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
} }
} }
bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor);
return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) ||
(tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU);
}
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
const int64_t ne10 = src1->ne[0]; static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 /*&& dst->type == GGML_TYPE_F32) {
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) {
return true; return true;
} }

View File

@ -9,7 +9,7 @@ extern "C" {
void ggml_cl_init(void); void ggml_cl_init(void);
void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor);
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

View File

@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
struct ggml_compute_state_shared *shared = state->shared; struct ggml_compute_state_shared *shared = state->shared;
GGML_ASSERT(shared); GGML_ASSERT(shared);
GGML_ASSERT(shared->task_runner); //GGML_ASSERT(shared->task_runner);
shared->n_ready++; shared->n_ready++;
@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
: shared->task_runner; : shared->task_runner;
enum ggml_compute_error err = runner(&state->params, state->node); enum ggml_compute_error err = runner(&state->params, state->node);
GGML_ASSERT(err == GGML_COMPUTE_OK); GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
ggml_spin_lock(&shared->spin); ggml_spin_lock(&shared->spin);
@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
size_t wsize) { size_t wsize) {
GGML_ASSERT(ctx); GGML_ASSERT(ctx);
GGML_ASSERT(node); GGML_ASSERT(node);
GGML_ASSERT(ctx->shared.task_runner); // GGML_ASSERT(ctx->shared.task_runner);
ggml_task_runner *runner = ctx->shared.task_runner; ggml_task_runner *runner = ctx->shared.task_runner;
if (node->task_profile.runner) { if (node->task_profile.runner) {
@ -448,7 +448,7 @@ START:
memset(&params, 0, sizeof(struct ggml_compute_params)); memset(&params, 0, sizeof(struct ggml_compute_params));
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { if (!node->task_profile.stages[type].valid) {
continue; continue;
} }
@ -519,18 +519,17 @@ START:
if (err == GGML_COMPUTE_FALLBACK) { if (err == GGML_COMPUTE_FALLBACK) {
PRINT_DEBUG("[main] fallback from profile, id=%d\n", PRINT_DEBUG("[main] fallback from profile, id=%d\n",
node->task_profile.id); node->task_profile.id);
GGML_ASSERT(node->task_profile.stages[1].backend > GGML_ASSERT(node->task_profile.id > 1);
GGML_TASK_BACKEND_CPU);
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
int n = ggml_get_task_profiles(node, profiles); int n = ggml_get_task_profiles(node, profiles);
GGML_ASSERT(n > 0); GGML_ASSERT(n > 0);
GGML_ASSERT(profiles[0].stages[1].backend == GGML_ASSERT(profiles[0].id == 1);
GGML_TASK_BACKEND_CPU);
memcpy(&node->task_profile, &profiles[0], memcpy(&node->task_profile, &profiles[0],
sizeof(struct ggml_task_profile)); sizeof(struct ggml_task_profile));
runner = ctx->shared.task_runner; runner = ctx->shared.task_runner;
GGML_ASSERT(runner);
goto START; goto START;
} }

View File

@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
// thread: optional OS thread runner, default value: // thread: optional OS thread runner, default value:
// `ggml_threading_graph_compute_thread`. // `ggml_threading_graph_compute_thread`.
// //
// features: optional for configure // task_runner: default task runner, nullable when tensor.runner is not NULL.
// Overridden by tensor.runner.
// features: configure threading behaviour, optional.
// threading additional features. see `ggml_threading_feature`, default 0. // threading additional features. see `ggml_threading_feature`, default 0.
// //
// stages_time: optional for collecting per-stage wall clock time. // stages_time: optional for collecting per-stage wall clock time.
@ -51,12 +53,6 @@ enum ggml_compute_error
ggml_threading_compute_tensor(struct ggml_threading_context *ctx, ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
struct ggml_tensor *node, void *wdata, struct ggml_tensor *node, void *wdata,
size_t wsize); size_t wsize);
// This is an experimental functionality for mulmat tune, as a thin wrapper.
enum ggml_compute_error
ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
struct ggml_tensor *tensor);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
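
A usage sketch of the updated contract described above: the default task runner passed to `ggml_threading_start()` may be NULL as long as every tensor to be computed carries its own `task_profile.runner`. This mirrors the pattern in test-ggml-threading.c further down; `my_runner` and `compute_one` are illustrative names, not part of the API.

```
// placeholder for a real ggml_task_runner implementation
static enum ggml_compute_error my_runner(const struct ggml_compute_params *params,
                                         struct ggml_tensor *node) {
    (void) params; (void) node;
    return GGML_COMPUTE_OK; // a real runner does the per-stage work here
}

static void compute_one(struct ggml_tensor *node, int n_threads) {
    node->task_profile.runner = my_runner;     // per-tensor runner overrides the default
    node->task_profile.stages[1].valid = true; // enable the COMPUTE stage

    // default task_runner is NULL: allowed because node->task_profile.runner is set
    struct ggml_threading_context *ctx = ggml_threading_start(
        n_threads, /*thread*/ NULL, /*task_runner*/ NULL,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, /*stages_time*/ NULL);

    enum ggml_compute_error err =
        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
    GGML_ASSERT(err == GGML_COMPUTE_OK);

    ggml_threading_stop(ctx);
}
```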

View File

@ -24,26 +24,7 @@ static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) {
return hash; return hash;
} }
static const char * // Return profile id, -1 when failed (such as unable to match shape).
ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
switch (backend) {
case GGML_TASK_BACKEND_NONE:
return "";
case GGML_TASK_BACKEND_CPU:
return "CPU";
case GGML_TASK_BACKEND_CPU_BLAS:
return "BLAS";
case GGML_TASK_BACKEND_GPU:
return "GPU";
case GGML_TASK_BACKEND_GPU_CUDA:
return "CUDA";
case GGML_TASK_BACKEND_GPU_CL:
return "CL";
default:
GGML_ASSERT(false);
}
}
// NOTE: we can not use the profile from tune because the profiles do not // NOTE: we can not use the profile from tune because the profiles do not
// contain fields such as runner, get_size. // contain fields such as runner, get_size.
int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
@ -101,20 +82,15 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
e->K = K; e->K = K;
#ifndef GGML_TUNE_NDEBUG #ifndef GGML_TUNE_NDEBUG
const char *names[3];
for (int i = 0; i < 3; i++) {
names[i] = ggml_mulmat_tune_task_backend_name(
prof->stages[i].backend);
}
printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, "
"backends: %s %s %s\n", "backends: %s %s %s\n",
M, N, K, prof->id, names[0], names[1], names[2]); M, N, K, prof->id, prof->name);
#endif #endif
} }
} }
} }
return prof->id; return prof ? prof->id : -1;
} }
void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
@ -283,15 +259,10 @@ static bool ggml_mulmat_tune_write_profiles(
int rc; int rc;
for (int i = 0; i < n_profiles; i++) { for (int i = 0; i < n_profiles; i++) {
const struct ggml_task_profile *profile = &profiles[i]; const struct ggml_task_profile *profile = &profiles[i];
rc = fprintf(fp, "%d ", profile->id);
if (rc <= 0) {
return false;
}
for (int j = 0; j < 3; j++) { for (int j = 0; j < 3; j++) {
const struct ggml_task_stage *ts = &profile->stages[j]; const struct ggml_task_stage *ts = &profile->stages[j];
rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0,
ts->wait ? 1 : 0); ts->parallel ? 1 : 0, ts->wait ? 1 : 0);
if (rc <= 0) { if (rc <= 0) {
return false; return false;
} }
@ -302,6 +273,10 @@ static bool ggml_mulmat_tune_write_profiles(
} }
} }
} }
rc = fprintf(fp, " %d %s", profile->id, profile->name);
if (rc <= 0) {
return false;
}
rc = fprintf(fp, "\n"); rc = fprintf(fp, "\n");
if (rc <= 0) { if (rc <= 0) {
return false; return false;
@ -407,24 +382,24 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
return ok; return ok;
} }
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
GGML_ASSERT(tune); GGML_ASSERT(tune);
memset(tune, 0, sizeof(struct ggml_mulmat_tune)); memset(tune, 0, sizeof(struct ggml_mulmat_tune));
int rc = fscanf(fp, "%d", &tune->version); int rc = fscanf(fp, "%d", &tune->version);
if (rc <= 0) { if (rc <= 0) {
return false; return 1;
} }
if (tune->version != GGML_MULMAT_TUNE_VERSION) { if (tune->version != GGML_MULMAT_TUNE_VERSION) {
fprintf(stderr, "[tune] version mismatch, run bench again\n"); fprintf(stderr, "[tune] version mismatch, run bench again\n");
return false; return 2;
} }
rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype,
&tune->n_shapes, &tune->n_threads); &tune->n_shapes, &tune->n_threads);
if (rc <= 0) { if (rc <= 0) {
return false; return 3;
} }
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
@ -434,7 +409,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
(int *)&shape->src0_type, (int *)&shape->src1_type, (int *)&shape->src0_type, (int *)&shape->src1_type,
&shape->n_profiles, &shape->m_num); &shape->n_profiles, &shape->m_num);
if (rc <= 0) { if (rc <= 0) {
return false; return 4;
} }
{ {
@ -451,24 +426,24 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
for (int ip = 0; ip < shape->n_profiles; ip++) { for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip]; struct ggml_task_profile *profile = &shape->profiles[ip];
rc = fscanf(fp, "%d ", &profile->id);
if (rc <= 0) {
return false;
}
for (int j = 0; j < 3; j++) { for (int j = 0; j < 3; j++) {
struct ggml_task_stage *ts = &profile->stages[j]; struct ggml_task_stage *ts = &profile->stages[j];
int backend; int valid;
int parallel; int parallel;
int wait; int wait;
rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait); rc = fscanf(fp, " %1d%1d%1d", &valid, &parallel, &wait);
if (rc <= 0) { if (rc <= 0) {
return false; return 5;
} }
ts->backend = (enum ggml_task_backend)backend; ts->valid = valid ? true : false;
ts->parallel = parallel ? true : false; ts->parallel = parallel ? true : false;
ts->wait = wait ? true : false; ts->wait = wait ? true : false;
} }
rc = fscanf(fp, "%d %s", &profile->id, profile->name);
if (rc <= 0) {
return 6;
}
} }
for (int i_m = 0; i_m < shape->m_num; i_m++) { for (int i_m = 0; i_m < shape->m_num; i_m++) {
@ -477,7 +452,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
if (ip == 0) { if (ip == 0) {
rc = fscanf(fp, "%d", &M); rc = fscanf(fp, "%d", &M);
if (rc <= 0) { if (rc <= 0) {
return false; return 7;
} }
} }
struct ggml_mulmat_tune_m *item = struct ggml_mulmat_tune_m *item =
@ -486,13 +461,13 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
rc = fscanf(fp, "%d %d %d", &item->stages_time[0], rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
&item->stages_time[1], &item->stages_time[2]); &item->stages_time[1], &item->stages_time[2]);
if (rc <= 0) { if (rc <= 0) {
return false; return 8;
} }
} }
} }
} }
return true; return 0;
} }
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
@ -535,7 +510,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
const struct ggml_task_profile *profile = &shape->profiles[ip]; const struct ggml_task_profile *profile = &shape->profiles[ip];
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { if (profile->stages[k].valid) {
rc = fprintf(fp, "%9d", item->stages_time[k]); rc = fprintf(fp, "%9d", item->stages_time[k]);
if (rc <= 0) { if (rc <= 0) {
return false; return false;
@ -562,8 +537,6 @@ const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
const int K, enum ggml_type src0_type, const int K, enum ggml_type src0_type,
enum ggml_type src1_type) { enum ggml_type src1_type) {
GGML_ASSERT(N > 0 && K > 0);
for (int i = 0; i < tune->n_shapes; i++) { for (int i = 0; i < tune->n_shapes; i++) {
const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; const struct ggml_mulmat_tune_shape *s = &tune->shapes[i];
if (s->src0_type != src0_type || s->src1_type != src1_type) { if (s->src0_type != src0_type || s->src1_type != src1_type) {
@ -574,7 +547,10 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
if (s->N == N && s->K == K) { if (s->N == N && s->K == K) {
return s; return s;
} }
} else if (s->N > 0 && s->K == 0) { }
if (GGML_MULMAT_N_SHAPES == 6) {
if (s->N > 0 && s->K == 0) {
if (s->N == N) { if (s->N == N) {
return s; return s;
} }
@ -584,6 +560,7 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
} }
} }
} }
}
return NULL; return NULL;
} }
@ -639,7 +616,7 @@ void ggml_mulmat_tune_estimate_time(
for (int i_stage = 0; i_stage < 3; i_stage++) { for (int i_stage = 0; i_stage < 3; i_stage++) {
const struct ggml_task_stage *stage = &profile->stages[i_stage]; const struct ggml_task_stage *stage = &profile->stages[i_stage];
if (stage->backend == GGML_TASK_BACKEND_NONE) { if (!stage->valid) {
continue; continue;
} }
@ -784,23 +761,6 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
return sz; return sz;
} }
int ggml_mulmat_tune_get_builtin_task_backends(
enum ggml_task_backend *backends) {
int i = 0;
backends[i++] = GGML_TASK_BACKEND_CPU;
if (ggml_cpu_has_cpublas()) {
backends[i++] = GGML_TASK_BACKEND_CPU_BLAS;
}
if (ggml_cpu_has_cublas()) {
backends[i++] = GGML_TASK_BACKEND_GPU_CUDA;
} else if (ggml_cpu_has_clblast()) {
backends[i++] = GGML_TASK_BACKEND_GPU_CL;
}
return i;
}
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params) { struct ggml_mulmat_tune_params *params) {
GGML_ASSERT(tune); GGML_ASSERT(tune);
@ -809,23 +769,6 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
memset(tune, 0, sizeof(struct ggml_mulmat_tune)); memset(tune, 0, sizeof(struct ggml_mulmat_tune));
enum ggml_task_backend backends[16];
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
if (n_backends < 2) {
fprintf(stderr,
"[tune] error: this program was not built with BLAS.\n");
return false;
}
if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K &&
params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) {
#if defined(GGML_USE_CLBLAST)
printf("[tune] error: cl implementation does not support k_quants at "
"the time of writing this code, skip.\n");
return false;
#endif
}
bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
if (!ok) { if (!ok) {
return false; return false;
@ -835,12 +778,13 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
char buf[128] = {0}; char buf[128] = {0};
int offset = 0; int offset = 0;
for (int i = 0; i < n_backends; i++) { for (int i = 0; i < tune->shapes[0].n_profiles; i++) {
if (i > 0) { if (i > 0) {
buf[offset++] = ','; buf[offset++] = ',';
buf[offset++] = ' '; buf[offset++] = ' ';
} }
const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); const char *name = tune->shapes[0].profiles[i].name;
GGML_ASSERT(name != NULL && strcmp(name, "") != 0);
size_t len = strlen(name); size_t len = strlen(name);
memcpy(&buf[offset], name, len); memcpy(&buf[offset], name, len);
offset += (int)len; offset += (int)len;
@ -848,16 +792,16 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
fprintf(stdout, fprintf(stdout,
"[tune] model: %s, ggml ftype: %d, " "[tune] model: %s, ggml ftype: %d, "
"n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n",
params->model.name, params->model.ftype, params->n_pass, params->model.name, params->model.ftype, params->n_pass,
params->n_threads, tune->n_shapes, buf); tune->n_shapes, params->n_threads, buf);
} }
int64_t stages_time[3]; int64_t stages_time[3];
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
struct ggml_threading_context *thrd_ctx = ggml_threading_start( struct ggml_threading_context *thrd_ctx =
tune->n_threads, NULL, ggml_compute_forward_wrapper, ggml_threading_start(tune->n_threads, NULL, NULL,
GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
@ -896,6 +840,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
for (int ip = 0; ip < shape->n_profiles; ip++) { for (int ip = 0; ip < shape->n_profiles; ip++) {
const struct ggml_task_profile *profile = &shape->profiles[ip]; const struct ggml_task_profile *profile = &shape->profiles[ip];
// GGML_ASSERT(profile->runner);
memcpy(&node->task_profile, profile, memcpy(&node->task_profile, profile,
sizeof(struct ggml_task_profile)); sizeof(struct ggml_task_profile));
@ -911,9 +856,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
stages_time[j] = 0; stages_time[j] = 0;
} }
enum ggml_compute_error err = ggml_threading_compute_tensor( ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
thrd_ctx, node, wdata, wsize);
GGML_ASSERT(err == GGML_COMPUTE_OK); if (memcmp(profile, &node->task_profile,
sizeof(struct ggml_task_profile)) != 0) {
printf("[tune] error: task profile changed, tensor op: "
"%d, original id: %d, current id: %d\n",
node->op, profile->id, node->task_profile.id);
exit(1);
}
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
int v = (int)stages_time[i]; int v = (int)stages_time[i];

View File

@ -10,7 +10,7 @@
extern "C" { extern "C" {
#endif #endif
#define GGML_MULMAT_TUNE_VERSION 9 #define GGML_MULMAT_TUNE_VERSION 10
#define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_N_SHAPES 4
#define GGML_MULMAT_CACHE_LEN 16 #define GGML_MULMAT_CACHE_LEN 16
@ -119,7 +119,7 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);
const struct ggml_mulmat_tune_shape * const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K,
@ -129,11 +129,6 @@ void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape,
int M, int M,
struct ggml_mulmat_tune_time *profile_time); struct ggml_mulmat_tune_time *profile_time);
const char *ggml_task_backend_name(enum ggml_task_backend backend);
int ggml_mulmat_tune_get_builtin_task_backends(
enum ggml_task_backend *backends);
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params); struct ggml_mulmat_tune_params *params);

906
ggml.c

File diff suppressed because it is too large

30
ggml.h
View File

@ -362,29 +362,10 @@ extern "C" {
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// As part of task config profile solution, `ggml_task_backend` defines
// backends for each task stage. Similar to `ggml_tensor.backend`,
// `ggml_tensor.task_profile` generalizes how to configure tensor computing
// at per task-stage level.
//
// The following enum values are designed as combination of hardware and
// optional software interface.
enum ggml_task_backend {
GGML_TASK_BACKEND_NONE = 0,
// [0x10, 0x1F]: CPU
GGML_TASK_BACKEND_CPU = 0x10,
GGML_TASK_BACKEND_CPU_BLAS = 0x11,
// [0x20 - 0x2F]: GPU
GGML_TASK_BACKEND_GPU = 0x20,
GGML_TASK_BACKEND_GPU_CUDA = 0x21,
GGML_TASK_BACKEND_GPU_CL = 0x22,
};
// config for computing one of the 3 task stages of a tensor. // config for computing one of the 3 task stages of a tensor.
struct ggml_task_stage { struct ggml_task_stage {
enum ggml_task_backend backend; bool valid;
bool parallel; bool parallel;
// hint idle workers go waiting, valid only when parallel is false. // hint idle workers go waiting, valid only when parallel is false.
bool wait; bool wait;
@ -407,13 +388,16 @@ extern "C" {
// Get wsize for node computing. // Get wsize for node computing.
// When return -1: should be explained as `fallback to CPU`, caller MUST // When return -1: should be explained as `fallback to CPU`, caller MUST
// determine how much memory to reserve for this node. // determine how much memory to reserve for this node.
typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor); typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor);
// config for computing a tensor. // config for computing a tensor.
struct ggml_task_profile { struct ggml_task_profile {
// profile id, start from 1. // profile id, start from 1.
int id; int id;
// Required, not empty, no whitespace.
char name[16];
// index 0: INIT, 1: COMPUTE, 2: FINALIZE // index 0: INIT, 1: COMPUTE, 2: FINALIZE
struct ggml_task_stage stages[3]; struct ggml_task_stage stages[3];
@ -421,7 +405,7 @@ extern "C" {
ggml_task_runner *runner; ggml_task_runner *runner;
// Optional function to return required wsize for wdata. // Optional function to return required wsize for wdata.
ggml_task_get_wsize *get_wsize; ggml_task_wsize_getter *wsize_getter;
// Optional flag for development. // Optional flag for development.
// MUST be used only in testing codes. // MUST be used only in testing codes.
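
For illustration, a minimal profile-provider sketch showing how the new `id`, `name`, and `stages[].valid` fields might be filled. It is modelled on the mock provider in test-ggml-tune.c below; the function name is hypothetical, and `<stdio.h>` plus the declarations above are assumed to be in scope.

```
static int my_task_profiles_provider(struct ggml_tensor *node,
                                     struct ggml_task_profile *profiles) {
    (void) node; // shape-independent in this sketch

    profiles[0].id = 1;
    snprintf(profiles[0].name, sizeof(profiles[0].name), "CPU");
    profiles[0].stages[0].valid = true; // INIT
    profiles[0].stages[1].valid = true; // COMPUTE

    profiles[1].id = 2;
    snprintf(profiles[1].name, sizeof(profiles[1].name), "BLAS");
    profiles[1].stages[0].valid = true; // INIT
    profiles[1].stages[1].valid = true; // COMPUTE

    return 2; // number of profiles
}
```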

View File

@ -2744,7 +2744,8 @@ struct llama_context * llama_init_from_file(
} }
#ifdef GGML_USE_TUNE #ifdef GGML_USE_TUNE
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune,
const char *fname) {
GGML_ASSERT(ctx->model.n_gpu_layers == 0); GGML_ASSERT(ctx->model.n_gpu_layers == 0);
printf("\n"); printf("\n");
@ -2767,9 +2768,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
case LLAMA_FTYPE_MOSTLY_Q4_1: case LLAMA_FTYPE_MOSTLY_Q4_1:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1; ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
break; break;
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
break;
case LLAMA_FTYPE_MOSTLY_Q5_0: case LLAMA_FTYPE_MOSTLY_Q5_0:
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0; ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
break; break;
@ -2799,8 +2797,8 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K; ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
break; break;
default: default:
throw std::runtime_error( fprintf(stderr, "[tune] unsupported file type %d\n", hparams->ftype);
format("invalid output file type %d\n", hparams->ftype)); return false;
} }
int n_vocab = hparams->n_vocab; int n_vocab = hparams->n_vocab;
@ -2831,7 +2829,13 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
ctx->tune = new (struct ggml_mulmat_tune); ctx->tune = new (struct ggml_mulmat_tune);
if (!ctx->tune) { if (!ctx->tune) {
throw std::runtime_error(format("failed to allocate memory for tune\n")); fprintf(stderr, "[tune] failed to allocate memory for tune\n");
return false;
}
if (!ggml_cpu_has_blas()) {
fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n");
return false;
} }
if (tune) { if (tune) {
@ -2844,31 +2848,30 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
ggml_mulmat_tune_free(ctx->tune); ggml_mulmat_tune_free(ctx->tune);
return true; return true;
} }
} else { } else if (empty_fname) {
if (empty_fname) {
return false; return false;
} }
}
if (!empty_fname) { if (!empty_fname) {
FILE *fp = fopen(fname, "r"); FILE *fp = fopen(fname, "r");
if (!fp) { if (!fp) {
fprintf(stderr, "[tune] failed to open file %s.\n", fprintf(stderr, "[tune] failed to open file %s.\n", fname);
fname); return false;
} else { } else {
bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); int rc = ggml_mulmat_tune_read_data(ctx->tune, fp);
fclose(fp); fclose(fp);
if (!ok) { if (rc != 0) {
fprintf(stderr, fprintf(stderr,
"[tune] failed to read data from %s\n", "[tune] failed to read data from %s, error code: %d\n",
fname); fname, rc);
return false; return false;
} }
fprintf(stderr, "[tune] loaded data from %s\n", fname); fprintf(stderr, "[tune] loaded data from %s\n", fname);
ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads); bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype,
params.n_threads);
if (!ok) { if (!ok) {
return false; return false;
} }

View File

@ -41,8 +41,7 @@ static const int n_repeat = 10;
// counter with array. // counter with array.
static int work_done_arr[MAX_N_THREADS]; static int work_done_arr[MAX_N_THREADS];
static enum ggml_compute_error static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params,
mock_task_runner(const struct ggml_compute_params *params,
struct ggml_tensor *node) { struct ggml_tensor *node) {
int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
if (node->task_profile.stages[params->type].parallel) { if (node->task_profile.stages[params->type].parallel) {
@ -80,20 +79,15 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
int t0 = (int)ggml_time_us(); int t0 = (int)ggml_time_us();
struct ggml_threading_context *ctx = ggml_threading_start( node->task_profile.runner = mock_task_runner;
n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL);
struct ggml_threading_context *ctx =
ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL);
int t1 = (int)ggml_time_us(); int t1 = (int)ggml_time_us();
for (int i = 0; i < n_repeat; i++) { for (int i = 0; i < n_repeat; i++) {
enum ggml_compute_error err = ggml_threading_compute_tensor( ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
if (err != GGML_COMPUTE_OK) {
ggml_threading_stop(ctx);
printf("ggml_threading_compute_tensor failed with error: %d.\n",
err);
return 1;
}
} }
int t2 = (int)ggml_time_us(); int t2 = (int)ggml_time_us();
@ -107,7 +101,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
int expect = 0; int expect = 0;
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
const struct ggml_task_stage *ts = &stages[i]; const struct ggml_task_stage *ts = &stages[i];
if (ts->backend != GGML_TASK_BACKEND_NONE) { if (ts->valid) {
if (ts->parallel) { if (ts->parallel) {
expect += n_threads; expect += n_threads;
} else { } else {
@ -144,14 +138,12 @@ static enum ggml_compute_error
mock_task_runner_fallback(const struct ggml_compute_params *params, mock_task_runner_fallback(const struct ggml_compute_params *params,
struct ggml_tensor *node) { struct ggml_tensor *node) {
UNUSED(params); UNUSED(params);
if (node->backend == GGML_BACKEND_GPU) {
// ... finally failed to compute in GPU.
node->backend = GGML_BACKEND_CPU; // failed to run ...
if (node->task_profile.id == 2) {
return GGML_COMPUTE_FALLBACK; return GGML_COMPUTE_FALLBACK;
} else {
return GGML_COMPUTE_OK;
} }
return GGML_COMPUTE_OK;
} }
// By design, fallback should happen when attempt computing tensor in GPU, // By design, fallback should happen when attempt computing tensor in GPU,
@ -164,6 +156,9 @@ int test_fallback(struct ggml_tensor *node) {
enum ggml_compute_error err = enum ggml_compute_error err =
ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
if (err == GGML_COMPUTE_FALLBACK) { if (err == GGML_COMPUTE_FALLBACK) {
// mock setup new profile ...
node->task_profile.id = 1;
err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
/*wsize*/ 0); /*wsize*/ 0);
} }
@ -214,12 +209,12 @@ int main(void) {
struct ggml_tensor node; struct ggml_tensor node;
memset(&node, 0, sizeof(struct ggml_tensor)); memset(&node, 0, sizeof(struct ggml_tensor));
node.task_profile.runner = mock_task_runner;
struct ggml_task_stage *stages = node.task_profile.stages; struct ggml_task_stage *stages = node.task_profile.stages;
stages[0].backend = GGML_TASK_BACKEND_CPU; stages[0].valid = true;
stages[1].backend = GGML_TASK_BACKEND_CPU; stages[1].valid = true;
stages[2].backend = GGML_TASK_BACKEND_NONE;
int n_passed = 0; int n_passed = 0;
int n_tests = 0; int n_tests = 0;
@ -277,7 +272,7 @@ int main(void) {
struct ggml_threading_context *ctx = struct ggml_threading_context *ctx =
ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
mock_task_runner, 0, /*stages_time*/ NULL); NULL, 0, /*stages_time*/ NULL);
int t1 = (int)ggml_time_us(); int t1 = (int)ggml_time_us();
@ -416,8 +411,8 @@ int main(void) {
node.src0 = &src0; node.src0 = &src0;
node.src1 = &src1; node.src1 = &src1;
node.backend = GGML_BACKEND_GPU; node.task_profile.id = 2;
stages[1].backend = GGML_TASK_BACKEND_GPU; stages[1].valid = true;
if (test_fallback(&node) == 0) { if (test_fallback(&node) == 0) {
++n_passed; ++n_passed;
printf("[test-ggml-threading] test fallback: ok\n\n"); printf("[test-ggml-threading] test fallback: ok\n\n");

View File

@ -46,14 +46,10 @@ int main(void) {
} }
static int bench(void) { static int bench(void) {
{ if (!ggml_cpu_has_blas()) {
enum ggml_task_backend backends[16];
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
if (n_backends < 2) {
printf("[test-ggml-tune] skipped because no BLAS\n"); printf("[test-ggml-tune] skipped because no BLAS\n");
return 0; return 0;
} }
}
{ {
struct ggml_init_params init_params = { struct ggml_init_params init_params = {
@ -118,10 +114,13 @@ static int
ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
struct ggml_task_profile *profiles) { struct ggml_task_profile *profiles) {
UNUSED(node); UNUSED(node);
profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; profiles[0].id = 1;
profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; profiles[0].stages[0].valid = true;
profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; profiles[0].stages[1].valid = true;
profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
profiles[1].id = 2;
profiles[1].stages[0].valid = true;
profiles[1].stages[1].valid = true;
return 2; return 2;
} }