From 7c05049f8b0ba6e090e5a9d5bb11a6d4c74e4a3f Mon Sep 17 00:00:00 2001
From: mqy
Date: Thu, 15 Jun 2023 14:06:11 +0800
Subject: [PATCH] tuning: check GPU offloading before loading model

---
 examples/common.cpp                |  8 ++++++++
 examples/perplexity/perplexity.cpp |  2 +-
 ggml-tune.c                        | 22 +++++++++++-----------
 llama.cpp                          | 12 +++++-------
 4 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index fd6df4947..09ce484a1 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -435,6 +435,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         process_escapes(params.prompt);
     }
 
+#ifdef GGML_USE_TUNE
+    if (params.n_gpu_layers > 0) {
+        if (params.tune || !params.tune_file.empty()) {
+            fprintf(stderr, "[tune] error: tuning and GPU offloading cannot be used at the same time, abort.\n");
+            exit(1);
+        }
+    }
+#endif
     return true;
 }
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 2cdd9db06..473220516 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
     }
 
 #ifdef GGML_USE_TUNE
-    if (params.tune || !params.tune_file.empty()){
+    if (params.tune || !params.tune_file.empty()) {
         bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
         if (!ok || (params.tune && !params.tune_file.empty())) {
             llama_free(ctx);
diff --git a/ggml-tune.c b/ggml-tune.c
index 52ca96bf3..0a52443e4 100644
--- a/ggml-tune.c
+++ b/ggml-tune.c
@@ -104,7 +104,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
                 prof->stages[i].backend);
         }
         printf(
-            "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the "
+            "\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
             "fastest profile: %s %s %s\n",
             M, N, K, names[0], names[1], names[2]);
 #endif
@@ -358,7 +358,7 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
     bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads,
                                                  errbuf, sizeof(errbuf));
     if (!ok) {
-        fprintf(stderr, "[mulmat tune] error: %s. run bench again.\n", errbuf);
+        fprintf(stderr, "[tune] error: %s. run bench again.\n", errbuf);
     }
 
     return ok;
@@ -371,7 +371,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
     }
 
     if (tune->version != GGML_MULMAT_TUNE_VERSION) {
-        fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n");
+        fprintf(stderr, "[tune] version mismatch, run bench again\n");
         return false;
     }
@@ -396,7 +396,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
             (shape->n_profiles * shape->m_num);
         shape->items = malloc(item_size);
         if (shape->items == NULL) {
-            fprintf(stderr, "[mulmat tune] failed to allocate memory\n");
+            fprintf(stderr, "[tune] failed to allocate memory\n");
             return false;
         }
         memset(shape->items, 0, item_size);
@@ -708,7 +708,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
 
     if (!buf) {
         fprintf(stderr,
-                "[mulmat tune] error: failed to allocate %zu MiB memory",
+                "[tune] error: failed to allocate %zu MiB memory",
                 sz / 1024 / 1024);
         return 0;
     }
@@ -745,7 +745,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
     int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
     if (n_backends < 2) {
         fprintf(stderr,
-                "[mulmat tune] error: this program was not built with BLAS.\n");
+                "[tune] error: this program was not built with BLAS.\n");
         return false;
     }
@@ -770,7 +770,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
     }
 
     fprintf(stdout,
-            "[mulmat tune] model: %s, ggml ftype: %d, "
+            "[tune] model: %s, ggml ftype: %d, "
             "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n",
             params->model.name, params->model.ftype, params->n_pass,
             params->n_threads, tune->n_shapes, buf);
@@ -871,7 +871,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
 
     ggml_threading_stop(thrd_ctx);
 
-    fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n",
+    fprintf(stdout, "[tune] done, elapsed time: %d seconds.\n",
             (int)(ggml_time_ms() - t0) / 1000);
 
     // output
@@ -880,7 +880,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
         FILE *fp = fopen(params->fname, "w");
         if (!fp) {
             fprintf(stderr,
-                    "[mulmat tune] warn: failed to open file `%s`, print to "
+                    "[tune] warn: failed to open file `%s`, print to "
                     "console instead\n\n",
                     params->fname);
             params->output_console = 1;
@@ -889,12 +889,12 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
             fclose(fp);
 
             if (ok) {
-                fprintf(stdout, "[mulmat tune] data was written to `%s`\n",
+                fprintf(stdout, "[tune] data was written to `%s`\n",
                         params->fname);
             } else {
                 fprintf(
                     stderr,
-                    "[mulmat tune] warn: failed to write file `%s`, print to "
+                    "[tune] warn: failed to write file `%s`, print to "
                     "console instead\n\n",
                     params->fname);
                 params->output_console = 1;
diff --git a/llama.cpp b/llama.cpp
index acc0e59f7..06555e1dd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2745,11 +2745,9 @@ struct llama_context * llama_init_from_file(
 #ifdef GGML_USE_TUNE
 bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune,
                        const char *fname) {
+    GGML_ASSERT(ctx->model.n_gpu_layers == 0);
+
     printf("\n");
-    if (ctx->model.n_gpu_layers != 0) {
-        fprintf(stderr, "[mulmat tune] error: is disabled by GPU offloading\n");
-        return false;
-    }
 
     const char *model_name = llama_model_type_name(ctx->model.type);
 
@@ -2855,7 +2853,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
     if (!empty_fname) {
        FILE *fp = fopen(fname, "r");
         if (!fp) {
-            fprintf(stderr, "[mulmat tune] failed to open file %s.\n",
+            fprintf(stderr, "[tune] failed to open file %s.\n",
                     fname);
         } else {
             bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
@@ -2863,12 +2861,12 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
             if (!ok) {
                 fprintf(stderr,
-                        "[mulmat tune] failed to read data from %s\n",
+                        "[tune] failed to read data from %s\n",
                         fname);
                 return false;
             }
 
-            fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname);
+            fprintf(stderr, "[tune] loaded data from %s\n", fname);
 
             ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype,
                                            params.n_threads);
             if (!ok) {
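
Note: the point of this change is to move the tune/offload conflict check out
of llama_mulmat_tune(), where it used to return false at runtime, up into
argument parsing, before any model weights are loaded; llama_mulmat_tune()
now simply asserts n_gpu_layers == 0. Below is a minimal self-contained
sketch of the parse-time guard; the struct and function names here are
illustrative, not part of the patch:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical mirror of the gpt_params fields this patch touches. */
    struct tune_opts {
        int         n_gpu_layers; /* -ngl: layers offloaded to the GPU */
        bool        tune;         /* --tune: run the mul_mat bench     */
        const char *tune_file;    /* --tune-file: load/save bench data */
    };

    /* Reject the combination at parse time, before loading the model,
     * mirroring the check added to gpt_params_parse() above. */
    static void check_tune_vs_offload(const struct tune_opts *o) {
        bool tuning = o->tune || (o->tune_file && o->tune_file[0] != '\0');
        if (o->n_gpu_layers > 0 && tuning) {
            fprintf(stderr, "[tune] error: tuning and GPU offloading cannot "
                            "be used at the same time, abort.\n");
            exit(1);
        }
    }

    int main(void) {
        struct tune_opts o = {32, true, ""}; /* as if: -ngl 32 --tune     */
        check_tune_vs_offload(&o);           /* prints the error, exits 1 */
        return 0;
    }

Failing fast at parse time is what makes the GGML_ASSERT in
llama_mulmat_tune() safe: by the time the model is loaded, the conflicting
combination can no longer occur, so the late "disabled by GPU offloading"
error path becomes unnecessary.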