mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-10 18:51:45 +00:00
bulk refactoring of task profile and related code to run CL GPU offloading.
* removed ggml_task_backend in favour of ggml_task_profile.runner and the newly added id and name.
* extracted the mul_mat BLAS code into ggml_compute_forward_mul_mat_blas, aligning with CUDA/CL a bit more and making it easier to fix profiles and run tune.
* rewrote the task profile and updated/added some CUDA/CL code, finally making CL GPU offloading work.
* misc minor fixes/updates to tune; the data format was changed.
This commit is contained in:
parent
6b83a3e16f
commit
06b00827a0
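For orientation before the diffs, here is a minimal sketch (not the verbatim header) of what the reshaped task configuration looks like after this commit, pieced together from the ggml.h hunk further down. The forward declarations and the plain `int` return types are placeholders for the real ggml types (the actual runner returns `enum ggml_compute_error`).

```c
#include <stdbool.h>

struct ggml_compute_params;   // opaque here; defined in ggml.h
struct ggml_tensor;

// Placeholder signatures; the real runner returns enum ggml_compute_error.
typedef int (ggml_task_runner)(const struct ggml_compute_params *params,
                               struct ggml_tensor *tensor);
typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor);

// Per-stage config is now a small bitmap instead of a ggml_task_backend value.
struct ggml_task_stage {
    bool valid;
    bool parallel;
    bool wait;   // hint idle workers to wait; meaningful only when parallel is false
};

// A profile is identified by id/name and carries its own runner, which is how
// CUDA/CL offloading is selected now that the backend enum is gone.
struct ggml_task_profile {
    int  id;                              // starts from 1
    char name[16];                        // required, no whitespace
    struct ggml_task_stage stages[3];     // 0: INIT, 1: COMPUTE, 2: FINALIZE
    ggml_task_runner       *runner;       // optional per-profile runner
    ggml_task_wsize_getter *wsize_getter; // optional wsize hook
};
```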
@@ -214,26 +214,19 @@ The following results are generated with Accelerate compiled.

**Example**

```
5 3B 2 6 1
[tune] done, elapsed time: 0 seconds.
10 xB 12 4 2

3200 3200 2 0 3 10
16 0 0 0 16 1 0 1 0 0 0 0
16 1 0 2 17 0 1 0 0 0 0 0
0 0 0 0 34 0 1 0 0 0 0 0
1 1 793 0 9103 2102 0 0 6014 0
2 2 1591 0 8034 2305 0 0 30982 0
4 4 2236 0 6476 2484 0 0 31388 0
8 7 4161 0 6623 2389 0 0 29204 0
16 15 8339 0 6434 2752 0 0 34303 0
32 32 16919 0 6915 3651 0 0 42511 0
64 200 34270 0 6574 4528 0 0 68212 0
128 188 69400 0 6325 6839 0 0 74437 0
256 303 134597 0 6168 11544 0 0 110180 0
512 687 279685 0 6337 29712 0 0 159728 0
1024 1024 12 0 2 4
100 110 000 1 CPU
110 101 000 2 BLAS
1 11 309 0 1234 90 0
2 23 654 0 1359 215 0
4 44 1283 0 1362 421 0
8 85 2341 0 1357 347 0

3200 8640 2 0 2 10

...
1024 2048 12 0 2 4
...

```
@@ -249,17 +242,17 @@ shape+

# head
version: 1
model: "3B" | "7B" | "13B" | "30B" | "65B"
ggml_ftype: 0 - 4, 7 - 14
ggml_ftype: 0 - 3, 7 - 14
n_shapes: number of shapes
n_threads: number of threads

shape := N K m_num n_profiles
task_conf_profile+
shape := N K src0_ggml_type src1_ggml_type n_profiles m_num
task_profile+
bench_item+

task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize)
stage_conf: backend parallel wait
backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL)
task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name
stage_conf(bitmap): valid parallel wait
valid: 0 (false) | 1 (true)
parallel: 0 (false) | 1 (true)
wait: 0 (false) | 1 (true)

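To make the new stage_conf bitmap concrete: each stage is serialized as three packed digits (valid, parallel, wait), so a profile line reads like `100 110 000 1 CPU`. Below is a small sketch of how such a bitmap could be written and read back with the same `%1d%1d%1d` format the tune code uses; `stage_conf` and the two helpers are hypothetical stand-ins, not the actual ggml-tune functions.

```c
#include <stdbool.h>
#include <stdio.h>

// Hypothetical mirror of one stage_conf bitmap: valid, parallel, wait.
struct stage_conf { bool valid, parallel, wait; };

// Write one stage as three packed digits, e.g. valid+parallel, no wait -> "110".
static void write_stage_conf(FILE *fp, const struct stage_conf *s) {
    fprintf(fp, "%1d%1d%1d", s->valid ? 1 : 0, s->parallel ? 1 : 0, s->wait ? 1 : 0);
}

// Read the three digits back; returns 0 on success, -1 on a scan failure.
static int read_stage_conf(FILE *fp, struct stage_conf *s) {
    int valid, parallel, wait;
    if (fscanf(fp, " %1d%1d%1d", &valid, &parallel, &wait) != 3) {
        return -1;
    }
    s->valid    = valid    != 0;
    s->parallel = parallel != 0;
    s->wait     = wait     != 0;
    return 0;
}
```

A full task_profile record is then three such bitmaps followed by the profile id and name, which is exactly what `ggml_mulmat_tune_write_profiles` emits in the ggml-tune.c hunks below.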
@@ -111,6 +111,11 @@ static void usage(char *prog) {
}

int main(int argc, char **argv) {
    if (!ggml_cpu_has_blas()) {
        fprintf(stderr, "error: this program is not built with BLAS.\n");
        return 1;
    }

    if (argc == 2) {
        if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
            usage(argv[0]);

25 ggml-cuda.cu
@@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
}

bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];

// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    // TODO: find the optimal values for these
    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
        src1->type == GGML_TYPE_F32 &&
        dst->type == GGML_TYPE_F32 &&
        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
        dst->type == GGML_TYPE_F32) {
        return true;
    }

@@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() {
    g_scratch_buffer = nullptr;
}

bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
    ggml_cuda_func_t func;
    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor);
    GGML_ASSERT(tensor->src0);
    return tensor->backend == GGML_BACKEND_GPU
        || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
        || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
}

bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
    ggml_cuda_func_t func;
    const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor);

    switch (tensor->op) {
        case GGML_OP_ADD:

@@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
            if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) {
            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
                return false;
            }
            func = ggml_cuda_mul_mat;

@@ -16,7 +16,7 @@ void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split);

void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_is_gpu_offloading(const struct ggml_tensor * src0);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

@@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    }
}

bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor);
    return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) ||
           (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU);
}

bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];

    // TODO: find the optimal values for these
// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
        src1->type == GGML_TYPE_F32 &&
        dst->type == GGML_TYPE_F32 /*&&
        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) {
        dst->type == GGML_TYPE_F32) {
        return true;
    }

@@ -9,7 +9,7 @@ extern "C" {
void ggml_cl_init(void);

void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor);
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {

    struct ggml_compute_state_shared *shared = state->shared;
    GGML_ASSERT(shared);
    GGML_ASSERT(shared->task_runner);
    //GGML_ASSERT(shared->task_runner);

    shared->n_ready++;

@@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
            : shared->task_runner;
    enum ggml_compute_error err = runner(&state->params, state->node);

    GGML_ASSERT(err == GGML_COMPUTE_OK);
    GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);

    ggml_spin_lock(&shared->spin);

@@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                              size_t wsize) {
    GGML_ASSERT(ctx);
    GGML_ASSERT(node);
    GGML_ASSERT(ctx->shared.task_runner);
    // GGML_ASSERT(ctx->shared.task_runner);

    ggml_task_runner *runner = ctx->shared.task_runner;
    if (node->task_profile.runner) {

@@ -448,7 +448,7 @@ START:
    memset(&params, 0, sizeof(struct ggml_compute_params));

    for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
        if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
        if (!node->task_profile.stages[type].valid) {
            continue;
        }

@@ -519,18 +519,17 @@ START:
    if (err == GGML_COMPUTE_FALLBACK) {
        PRINT_DEBUG("[main] fallback from profile, id=%d\n",
                    node->task_profile.id);
        GGML_ASSERT(node->task_profile.stages[1].backend >
                    GGML_TASK_BACKEND_CPU);
        GGML_ASSERT(node->task_profile.id > 1);

        struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
        int n = ggml_get_task_profiles(node, profiles);
        GGML_ASSERT(n > 0);
        GGML_ASSERT(profiles[0].stages[1].backend ==
                    GGML_TASK_BACKEND_CPU);
        GGML_ASSERT(profiles[0].id == 1);

        memcpy(&node->task_profile, &profiles[0],
               sizeof(struct ggml_task_profile));
               sizeof(struct ggml_task_profile));
        runner = ctx->shared.task_runner;
        GGML_ASSERT(runner);

        goto START;
    }

@@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
// thread: optional OS thread runner, default value:
//         `ggml_threading_graph_compute_thread`.
//
// features: optional for configure
// task_runner: default task runner, nullable when tensor.runner is not NULL.
//              Overridden by tensor.runner.
// features: configure threading behaviour, optional.
//           threading additional features. see `ggml_threading_feature`, default 0.
//
// stages_time: optional for collecting per-stage wall clock time.

@@ -51,12 +53,6 @@ enum ggml_compute_error
ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                              struct ggml_tensor *node, void *wdata,
                              size_t wsize);

// This is an experimental functionality for mulmat tune, as a thin wrapper.
enum ggml_compute_error
ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
                             struct ggml_tensor *tensor);

#ifdef __cplusplus
}
#endif
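The reworded comments above amount to a simple dispatch rule: a tensor's profile runner, when present, overrides the context-wide default task runner, which is why the hard asserts on `shared->task_runner` are relaxed in this commit. A minimal sketch of that selection, using hypothetical stand-in types rather than the real ggml-threading structs:

```c
// Hypothetical stand-ins for the real ggml-threading types.
typedef int (task_runner_fn)(void *params, void *node);

struct node_profile { task_runner_fn *runner; };          // ggml_task_profile.runner
struct node         { struct node_profile task_profile; };
struct thread_ctx   { task_runner_fn *task_runner; };     // default runner, now nullable

// The per-tensor runner wins; the context-wide default is only a fallback.
static task_runner_fn *select_runner(const struct thread_ctx *ctx,
                                     const struct node *n) {
    return n->task_profile.runner ? n->task_profile.runner : ctx->task_runner;
}
```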

165 ggml-tune.c
@@ -24,26 +24,7 @@ static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) {
    return hash;
}

static const char *
ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
    switch (backend) {
    case GGML_TASK_BACKEND_NONE:
        return "";
    case GGML_TASK_BACKEND_CPU:
        return "CPU";
    case GGML_TASK_BACKEND_CPU_BLAS:
        return "BLAS";
    case GGML_TASK_BACKEND_GPU:
        return "GPU";
    case GGML_TASK_BACKEND_GPU_CUDA:
        return "CUDA";
    case GGML_TASK_BACKEND_GPU_CL:
        return "CL";
    default:
        GGML_ASSERT(false);
    }
}

// Return profile id, -1 when failed (such as unable to match shape).
// NOTE: we can not use the profile from tune because the profiles do not
// contain fields such as runner, get_size.
int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,

@@ -101,20 +82,15 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
            e->K = K;

#ifndef GGML_TUNE_NDEBUG
            const char *names[3];
            for (int i = 0; i < 3; i++) {
                names[i] = ggml_mulmat_tune_task_backend_name(
                    prof->stages[i].backend);
            }
            printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, "
                   "backends: %s %s %s\n",
                   M, N, K, prof->id, names[0], names[1], names[2]);
                   M, N, K, prof->id, prof->name);
#endif
        }
    }
}

    return prof->id;
    return prof ? prof->id : -1;
}

void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,

@@ -283,25 +259,24 @@ static bool ggml_mulmat_tune_write_profiles(
    int rc;
    for (int i = 0; i < n_profiles; i++) {
        const struct ggml_task_profile *profile = &profiles[i];
        rc = fprintf(fp, "%d ", profile->id);
        if (rc <= 0) {
            return false;
        }

        for (int j = 0; j < 3; j++) {
            const struct ggml_task_stage *ts = &profile->stages[j];
            rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
                         ts->wait ? 1 : 0);
            rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0,
                         ts->parallel ? 1 : 0, ts->wait ? 1 : 0);
            if (rc <= 0) {
                return false;
            }
            if (j < 2) {
                rc = fprintf(fp, " ");
                rc = fprintf(fp, " ");
                if (rc <= 0) {
                    return false;
                }
            }
        }
        rc = fprintf(fp, " %d %s", profile->id, profile->name);
        if (rc <= 0) {
            return false;
        }
        rc = fprintf(fp, "\n");
        if (rc <= 0) {
            return false;

@@ -407,24 +382,24 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
    return ok;
}

bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
    GGML_ASSERT(tune);
    memset(tune, 0, sizeof(struct ggml_mulmat_tune));

    int rc = fscanf(fp, "%d", &tune->version);
    if (rc <= 0) {
        return false;
        return 1;
    }

    if (tune->version != GGML_MULMAT_TUNE_VERSION) {
        fprintf(stderr, "[tune] version mismatch, run bench again\n");
        return false;
        return 2;
    }

    rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype,
                &tune->n_shapes, &tune->n_threads);
    if (rc <= 0) {
        return false;
        return 3;
    }

    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {

@@ -434,7 +409,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
                    (int *)&shape->src0_type, (int *)&shape->src1_type,
                    &shape->n_profiles, &shape->m_num);
        if (rc <= 0) {
            return false;
            return 4;
        }

        {

@@ -451,24 +426,24 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
        for (int ip = 0; ip < shape->n_profiles; ip++) {
            struct ggml_task_profile *profile = &shape->profiles[ip];

            rc = fscanf(fp, "%d ", &profile->id);
            if (rc <= 0) {
                return false;
            }

            for (int j = 0; j < 3; j++) {
                struct ggml_task_stage *ts = &profile->stages[j];
                int backend;
                int valid;
                int parallel;
                int wait;
                rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait);
                rc = fscanf(fp, " %1d%1d%1d", &valid, &parallel, &wait);
                if (rc <= 0) {
                    return false;
                    return 5;
                }
                ts->backend = (enum ggml_task_backend)backend;
                ts->valid = valid ? true : false;
                ts->parallel = parallel ? true : false;
                ts->wait = wait ? true : false;
            }

            rc = fscanf(fp, "%d %s", &profile->id, profile->name);
            if (rc <= 0) {
                return 6;
            }
        }

        for (int i_m = 0; i_m < shape->m_num; i_m++) {

@@ -477,7 +452,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
                if (ip == 0) {
                    rc = fscanf(fp, "%d", &M);
                    if (rc <= 0) {
                        return false;
                        return 7;
                    }
                }
                struct ggml_mulmat_tune_m *item =

@@ -486,13 +461,13 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
                rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
                            &item->stages_time[1], &item->stages_time[2]);
                if (rc <= 0) {
                    return false;
                    return 8;
                }
            }
        }
    }

    return true;
    return 0;
}

bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,

@@ -535,7 +510,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,

            const struct ggml_task_profile *profile = &shape->profiles[ip];
            for (int k = 0; k < 3; k++) {
                if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
                if (profile->stages[k].valid) {
                    rc = fprintf(fp, "%9d", item->stages_time[k]);
                    if (rc <= 0) {
                        return false;

@@ -562,8 +537,6 @@ const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
                           const int K, enum ggml_type src0_type,
                           enum ggml_type src1_type) {
    GGML_ASSERT(N > 0 && K > 0);

    for (int i = 0; i < tune->n_shapes; i++) {
        const struct ggml_mulmat_tune_shape *s = &tune->shapes[i];
        if (s->src0_type != src0_type || s->src1_type != src1_type) {

@@ -574,13 +547,17 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
            if (s->N == N && s->K == K) {
                return s;
            }
        } else if (s->N > 0 && s->K == 0) {
            if (s->N == N) {
                return s;
            }
        } else if (s->N == 0 && s->K > 0) {
            if (s->K == K) {
                return s;
            }

            if (GGML_MULMAT_N_SHAPES == 6) {
                if (s->N > 0 && s->K == 0) {
                    if (s->N == N) {
                        return s;
                    }
                } else if (s->N == 0 && s->K > 0) {
                    if (s->K == K) {
                        return s;
                    }
                }
            }
        }

@@ -639,7 +616,7 @@ void ggml_mulmat_tune_estimate_time(

    for (int i_stage = 0; i_stage < 3; i_stage++) {
        const struct ggml_task_stage *stage = &profile->stages[i_stage];
        if (stage->backend == GGML_TASK_BACKEND_NONE) {
        if (!stage->valid) {
            continue;
        }

@@ -784,23 +761,6 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
    return sz;
}

int ggml_mulmat_tune_get_builtin_task_backends(
    enum ggml_task_backend *backends) {
    int i = 0;
    backends[i++] = GGML_TASK_BACKEND_CPU;

    if (ggml_cpu_has_cpublas()) {
        backends[i++] = GGML_TASK_BACKEND_CPU_BLAS;
    }

    if (ggml_cpu_has_cublas()) {
        backends[i++] = GGML_TASK_BACKEND_GPU_CUDA;
    } else if (ggml_cpu_has_clblast()) {
        backends[i++] = GGML_TASK_BACKEND_GPU_CL;
    }
    return i;
}

bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                            struct ggml_mulmat_tune_params *params) {
    GGML_ASSERT(tune);

@@ -809,23 +769,6 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,

    memset(tune, 0, sizeof(struct ggml_mulmat_tune));

    enum ggml_task_backend backends[16];
    int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
    if (n_backends < 2) {
        fprintf(stderr,
                "[tune] error: this program was not built with BLAS.\n");
        return false;
    }

    if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K &&
        params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) {
#if defined(GGML_USE_CLBLAST)
        printf("[tune] error: cl implementation does not support k_quants at "
               "the time of writing this code, skip.\n");
        return false;
#endif
    }

    bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
    if (!ok) {
        return false;

@@ -835,12 +778,13 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
        char buf[128] = {0};
        int offset = 0;

        for (int i = 0; i < n_backends; i++) {
        for (int i = 0; i < tune->shapes[0].n_profiles; i++) {
            if (i > 0) {
                buf[offset++] = ',';
                buf[offset++] = ' ';
            }
            const char *name = ggml_mulmat_tune_task_backend_name(backends[i]);
            const char *name = tune->shapes[0].profiles[i].name;
            GGML_ASSERT(name != NULL && strcmp(name, "") != 0);
            size_t len = strlen(name);
            memcpy(&buf[offset], name, len);
            offset += (int)len;

@@ -848,17 +792,17 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,

        fprintf(stdout,
                "[tune] model: %s, ggml ftype: %d, "
                "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n",
                "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n",
                params->model.name, params->model.ftype, params->n_pass,
                params->n_threads, tune->n_shapes, buf);
                tune->n_shapes, params->n_threads, buf);
    }

    int64_t stages_time[3];
    int64_t t0 = ggml_time_ms();

    struct ggml_threading_context *thrd_ctx = ggml_threading_start(
        tune->n_threads, NULL, ggml_compute_forward_wrapper,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);
    struct ggml_threading_context *thrd_ctx =
        ggml_threading_start(tune->n_threads, NULL, NULL,
                             GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);

    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
        const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];

@@ -896,6 +840,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,

        for (int ip = 0; ip < shape->n_profiles; ip++) {
            const struct ggml_task_profile *profile = &shape->profiles[ip];
            // GGML_ASSERT(profile->runner);

            memcpy(&node->task_profile, profile,
                   sizeof(struct ggml_task_profile));

@@ -911,9 +856,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                stages_time[j] = 0;
            }

            enum ggml_compute_error err = ggml_threading_compute_tensor(
                thrd_ctx, node, wdata, wsize);
            GGML_ASSERT(err == GGML_COMPUTE_OK);
            ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);

            if (memcmp(profile, &node->task_profile,
                       sizeof(struct ggml_task_profile)) != 0) {
                printf("[tune] error: task profile changed, tensor op: "
                       "%d, original id: %d, current id: %d\n",
                       node->op, profile->id, node->task_profile.id);
                exit(1);
            }

            for (int i = 0; i < 3; i++) {
                int v = (int)stages_time[i];

@@ -10,7 +10,7 @@
extern "C" {
#endif

#define GGML_MULMAT_TUNE_VERSION 9
#define GGML_MULMAT_TUNE_VERSION 10
#define GGML_MULMAT_N_SHAPES 4
#define GGML_MULMAT_CACHE_LEN 16

@@ -119,7 +119,7 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);

bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);

bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);
int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);

const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K,

@@ -129,11 +129,6 @@ void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape,
                                    int M,
                                    struct ggml_mulmat_tune_time *profile_time);

const char *ggml_task_backend_name(enum ggml_task_backend backend);

int ggml_mulmat_tune_get_builtin_task_backends(
    enum ggml_task_backend *backends);

bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                            struct ggml_mulmat_tune_params *params);

30 ggml.h
@@ -362,29 +362,10 @@ extern "C" {

    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);

    // As part of task config profile solution, `ggml_task_backend` defines
    // backends for each task stage. Similar to `ggml_tensor.backend`,
    // `ggml_tensor.task_profile` generalizes how to configure tensor computing
    // at per task-stage level.
    //
    // The following enum values are designed as combination of hardware and
    // optional software interface.
    enum ggml_task_backend {
        GGML_TASK_BACKEND_NONE = 0,

        // [0x10, 0x1F]: CPU
        GGML_TASK_BACKEND_CPU = 0x10,
        GGML_TASK_BACKEND_CPU_BLAS = 0x11,

        // [0x20 - 0x2F]: GPU
        GGML_TASK_BACKEND_GPU = 0x20,
        GGML_TASK_BACKEND_GPU_CUDA = 0x21,
        GGML_TASK_BACKEND_GPU_CL = 0x22,
    };

    // config for computing one of the 3 task stages of a tensor.
    struct ggml_task_stage {
        enum ggml_task_backend backend;
        bool valid;

        bool parallel;
        // hint idle workers go waiting, valid only when parallel is false.
        bool wait;

@@ -407,13 +388,16 @@ extern "C" {
    // Get wsize for node computing.
    // When return -1: should be explained as `fallback to CPU`, caller MUST
    // determine how much memory to reserve for this node.
    typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor);
    typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor);

    // config for computing a tensor.
    struct ggml_task_profile {
        // profile id, start from 1.
        int id;

        // Required, not empty, no whitespaces.
        char name[16];

        // index 0: INIT, 1: COMPUTE, 2: FINALIZE
        struct ggml_task_stage stages[3];

@@ -421,7 +405,7 @@ extern "C" {
        ggml_task_runner *runner;

        // Optional function to return required wsize for wdata.
        ggml_task_get_wsize *get_wsize;
        ggml_task_wsize_getter *wsize_getter;

        // Optional flag for development.
        // MUST be used only in testing codes.
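With the backend enum gone, a task-profiles provider only has to fill in id, name and the per-stage valid/parallel/wait bits (plus an optional runner). The sketch below mirrors the CPU/BLAS pair from the example tune output near the top ("100 110 000 1 CPU" and "110 101 000 2 BLAS"); the local struct mirrors and the provider name are illustrative, not the real ggml API.

```c
#include <stdbool.h>
#include <string.h>

struct ggml_tensor;   // opaque here

// Minimal local mirrors of the structs above, so the sketch stands alone.
struct task_stage   { bool valid, parallel, wait; };
struct task_profile { int id; char name[16]; struct task_stage stages[3]; };

// Hypothetical provider: fills two profiles and returns how many it produced.
// By convention, profile id 1 stays the plain CPU profile used for fallback.
static int example_task_profiles_provider(struct ggml_tensor *node,
                                          struct task_profile *profiles) {
    (void) node;  // a real provider branches on node->op and tensor types
    memset(profiles, 0, 2 * sizeof(struct task_profile));

    profiles[0].id = 1;
    strcpy(profiles[0].name, "CPU");
    profiles[0].stages[0].valid    = true;   // INIT:    100
    profiles[0].stages[1].valid    = true;   // COMPUTE: 110
    profiles[0].stages[1].parallel = true;

    profiles[1].id = 2;
    strcpy(profiles[1].name, "BLAS");
    profiles[1].stages[0].valid    = true;   // INIT:    110
    profiles[1].stages[0].parallel = true;
    profiles[1].stages[1].valid    = true;   // COMPUTE: 101
    profiles[1].stages[1].wait     = true;   // idle workers wait while BLAS runs

    return 2;
}
```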

73 llama.cpp
@@ -2744,8 +2744,9 @@ struct llama_context * llama_init_from_file(
}

#ifdef GGML_USE_TUNE
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
    GGML_ASSERT (ctx->model.n_gpu_layers == 0);
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune,
                       const char *fname) {
    GGML_ASSERT(ctx->model.n_gpu_layers == 0);

    printf("\n");

@@ -2755,7 +2756,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons

    enum ggml_ftype ggml_ftype;
    switch (hparams->ftype) {
    case LLAMA_FTYPE_ALL_F32:
    case LLAMA_FTYPE_ALL_F32:
        ggml_ftype = GGML_FTYPE_ALL_F32;
        break;
    case LLAMA_FTYPE_MOSTLY_F16:

@@ -2767,9 +2768,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
    case LLAMA_FTYPE_MOSTLY_Q4_1:
        ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
        break;
    case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
        ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
        break;
    case LLAMA_FTYPE_MOSTLY_Q5_0:
        ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
        break;

@@ -2799,8 +2797,8 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
        ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
        break;
    default:
        throw std::runtime_error(
            format("invalid output file type %d\n", hparams->ftype));
        fprintf(stderr, "[tune] unsupported file type %d\n", hparams->ftype);
        return false;
    }

    int n_vocab = hparams->n_vocab;

@@ -2808,30 +2806,36 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
    int n_rot = hparams->n_rot;

    int n_mult = hparams->n_mult;
    int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
    int n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;

    struct ggml_mulmat_tune_params params = {
        /*.model =*/ {
            /* .name =*/ model_name,
            /* .ftype =*/ ggml_ftype,
            /* .n_vocab =*/ n_vocab,
            /* .n_embd =*/ n_embd,
            /* .n_ff =*/ n_ff,
            /* .n_rot =*/ n_rot,
        /*.model =*/{
            /* .name =*/model_name,
            /* .ftype =*/ggml_ftype,
            /* .n_vocab =*/n_vocab,
            /* .n_embd =*/n_embd,
            /* .n_ff =*/n_ff,
            /* .n_rot =*/n_rot,
        },
        /* .m_num =*/ 8,
        /* .n_pass =*/ 1,
        /* .n_threads =*/ n_threads,
        /* .prrogress =*/ true,
        /* .output_console =*/ false,
        /* .fname =*/ fname,
        /* .m_num =*/8,
        /* .n_pass =*/1,
        /* .n_threads =*/n_threads,
        /* .prrogress =*/true,
        /* .output_console =*/false,
        /* .fname =*/fname,
    };

    bool empty_fname = !fname || strcmp(fname, "") == 0;

    ctx->tune = new(struct ggml_mulmat_tune);
    ctx->tune = new (struct ggml_mulmat_tune);
    if (!ctx->tune) {
        throw std::runtime_error(format("failed to allocate memory for tune\n"));
        fprintf(stderr, "[tune] failed to allocate memory for tune\n");
        return false;
    }

    if (!ggml_cpu_has_blas()) {
        fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n");
        return false;
    }

    if (tune) {

@@ -2844,31 +2848,30 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
            ggml_mulmat_tune_free(ctx->tune);
            return true;
        }
    } else {
        if (empty_fname) {
            return false;
        }
    } else if (empty_fname) {
        return false;
    }

    if (!empty_fname) {
        FILE *fp = fopen(fname, "r");
        if (!fp) {
            fprintf(stderr, "[tune] failed to open file %s.\n",
                    fname);
            fprintf(stderr, "[tune] failed to open file %s.\n", fname);
            return false;
        } else {
            bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
            int rc = ggml_mulmat_tune_read_data(ctx->tune, fp);
            fclose(fp);

            if (!ok) {
            if (rc != 0) {
                fprintf(stderr,
                        "[tune] failed to read data from %s\n",
                        fname);
                        "[tune] failed to read data from %s, error code: %d\n",
                        fname, rc);
                return false;
            }

            fprintf(stderr, "[tune] loaded data from %s\n", fname);

            ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads);
            bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype,
                                                params.n_threads);
            if (!ok) {
                return false;
            }

@@ -41,9 +41,8 @@ static const int n_repeat = 10;
// counter with array.
static int work_done_arr[MAX_N_THREADS];

static enum ggml_compute_error
mock_task_runner(const struct ggml_compute_params *params,
                 struct ggml_tensor *node) {
static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params,
                                                struct ggml_tensor *node) {
    int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
    if (node->task_profile.stages[params->type].parallel) {
        loops /= params->nth;

@@ -80,20 +79,15 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {

    int t0 = (int)ggml_time_us();

    struct ggml_threading_context *ctx = ggml_threading_start(
        n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL);
    node->task_profile.runner = mock_task_runner;

    struct ggml_threading_context *ctx =
        ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL);

    int t1 = (int)ggml_time_us();

    for (int i = 0; i < n_repeat; i++) {
        enum ggml_compute_error err = ggml_threading_compute_tensor(
            ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
        if (err != GGML_COMPUTE_OK) {
            ggml_threading_stop(ctx);
            printf("ggml_threading_compute_tensor failed with error: %d.\n",
                   err);
            return 1;
        }
        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
    }

    int t2 = (int)ggml_time_us();

@@ -107,7 +101,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
    int expect = 0;
    for (int i = 0; i < 3; i++) {
        const struct ggml_task_stage *ts = &stages[i];
        if (ts->backend != GGML_TASK_BACKEND_NONE) {
        if (ts->valid) {
            if (ts->parallel) {
                expect += n_threads;
            } else {

@@ -144,14 +138,12 @@ static enum ggml_compute_error
mock_task_runner_fallback(const struct ggml_compute_params *params,
                          struct ggml_tensor *node) {
    UNUSED(params);
    if (node->backend == GGML_BACKEND_GPU) {
        // ... finally failed to compute in GPU.

        node->backend = GGML_BACKEND_CPU;
    // failed to run ...
    if (node->task_profile.id == 2) {
        return GGML_COMPUTE_FALLBACK;
    } else {
        return GGML_COMPUTE_OK;
    }
    return GGML_COMPUTE_OK;
}

// By design, fallback should happen when attempt computing tensor in GPU,

@@ -164,6 +156,9 @@ int test_fallback(struct ggml_tensor *node) {
    enum ggml_compute_error err =
        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
    if (err == GGML_COMPUTE_FALLBACK) {
        // mock setup new profile ...
        node->task_profile.id = 1;

        err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
                                            /*wsize*/ 0);
    }

@@ -214,12 +209,12 @@ int main(void) {

    struct ggml_tensor node;
    memset(&node, 0, sizeof(struct ggml_tensor));
    node.task_profile.runner = mock_task_runner;

    struct ggml_task_stage *stages = node.task_profile.stages;

    stages[0].backend = GGML_TASK_BACKEND_CPU;
    stages[1].backend = GGML_TASK_BACKEND_CPU;
    stages[2].backend = GGML_TASK_BACKEND_NONE;
    stages[0].valid = true;
    stages[1].valid = true;

    int n_passed = 0;
    int n_tests = 0;

@@ -277,7 +272,7 @@ int main(void) {

    struct ggml_threading_context *ctx =
        ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
                             mock_task_runner, 0, /*stages_time*/ NULL);
                             NULL, 0, /*stages_time*/ NULL);

    int t1 = (int)ggml_time_us();

@@ -416,8 +411,8 @@ int main(void) {
    node.src0 = &src0;
    node.src1 = &src1;

    node.backend = GGML_BACKEND_GPU;
    stages[1].backend = GGML_TASK_BACKEND_GPU;
    node.task_profile.id = 2;
    stages[1].valid = true;
    if (test_fallback(&node) == 0) {
        ++n_passed;
        printf("[test-ggml-threading] test fallback: ok\n\n");

@@ -46,13 +46,9 @@ int main(void) {
}

static int bench(void) {
    {
        enum ggml_task_backend backends[16];
        int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
        if (n_backends < 2) {
            printf("[test-ggml-tune] skipped because no BLAS\n");
            return 0;
        }
    if (!ggml_cpu_has_blas()) {
        printf("[test-ggml-tune] skipped because no BLAS\n");
        return 0;
    }

    {

@@ -118,10 +114,13 @@ static int
ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
                                     struct ggml_task_profile *profiles) {
    UNUSED(node);
    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
    profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
    profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
    profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
    profiles[0].id = 1;
    profiles[0].stages[0].valid = true;
    profiles[0].stages[1].valid = true;

    profiles[1].id = 2;
    profiles[1].stages[0].valid = true;
    profiles[1].stages[1].valid = true;

    return 2;
}