From 7c05049f8b0ba6e090e5a9d5bb11a6d4c74e4a3f Mon Sep 17 00:00:00 2001
From: mqy
Date: Thu, 15 Jun 2023 14:06:11 +0800
Subject: [PATCH] tuning: check GPU offloading before loading model

---
 examples/common.cpp                |  8 ++++++++
 examples/perplexity/perplexity.cpp |  2 +-
 ggml-tune.c                        | 22 +++++++++++-----------
 llama.cpp                          | 12 +++++-------
 4 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index fd6df4947..09ce484a1 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -435,6 +435,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         process_escapes(params.prompt);
     }
 
+#ifdef GGML_USE_TUNE
+    if (params.n_gpu_layers > 0) {
+        if (params.tune || !params.tune_file.empty()) {
+            fprintf(stderr, "[tune] error: tuning and GPU offloading cannot be used at the same time, abort.\n");
+            exit(1);
+        }
+    }
+#endif
     return true;
 }
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 2cdd9db06..473220516 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
     }
 
 #ifdef GGML_USE_TUNE
-    if (params.tune || !params.tune_file.empty()){
+    if (params.tune || !params.tune_file.empty()) {
         bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
         if (!ok || (params.tune && !params.tune_file.empty())) {
             llama_free(ctx);
diff --git a/ggml-tune.c b/ggml-tune.c
index 52ca96bf3..0a52443e4 100644
--- a/ggml-tune.c
+++ b/ggml-tune.c
@@ -104,7 +104,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
                 prof->stages[i].backend);
         }
         printf(
-            "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the "
+            "\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
             "fastest profile: %s %s %s\n",
             M, N, K, names[0], names[1], names[2]);
 #endif
@@ -358,7 +358,7 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
     bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads,
                                                  errbuf, sizeof(errbuf));
     if (!ok) {
-        fprintf(stderr, "[mulmat tune] error: %s. run bench again.\n", errbuf);
+        fprintf(stderr, "[tune] error: %s. run bench again.\n", errbuf);
     }
 
     return ok;
@@ -371,7 +371,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
     }
 
     if (tune->version != GGML_MULMAT_TUNE_VERSION) {
-        fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n");
+        fprintf(stderr, "[tune] version mismatch, run bench again\n");
         return false;
     }
@@ -396,7 +396,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
             (shape->n_profiles * shape->m_num);
         shape->items = malloc(item_size);
         if (shape->items == NULL) {
-            fprintf(stderr, "[mulmat tune] failed to allocate memory\n");
+            fprintf(stderr, "[tune] failed to allocate memory\n");
             return false;
         }
         memset(shape->items, 0, item_size);
@@ -708,7 +708,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
 
     if (!buf) {
         fprintf(stderr,
-                "[mulmat tune] error: failed to allocate %zu MiB memory",
+                "[tune] error: failed to allocate %zu MiB memory",
                 sz / 1024 / 1024);
         return 0;
     }
@@ -745,7 +745,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
     int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
     if (n_backends < 2) {
         fprintf(stderr,
-                "[mulmat tune] error: this program was not built with BLAS.\n");
+                "[tune] error: this program was not built with BLAS.\n");
         return false;
     }
@@ -770,7 +770,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
     }
 
     fprintf(stdout,
-            "[mulmat tune] model: %s, ggml ftype: %d, "
+            "[tune] model: %s, ggml ftype: %d, "
             "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n",
             params->model.name, params->model.ftype, params->n_pass,
             params->n_threads, tune->n_shapes, buf);
@@ -871,7 +871,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
 
     ggml_threading_stop(thrd_ctx);
 
-    fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n",
+    fprintf(stdout, "[tune] done, elapsed time: %d seconds.\n",
             (int)(ggml_time_ms() - t0) / 1000);
 
     // output
@@ -880,7 +880,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
         FILE *fp = fopen(params->fname, "w");
         if (!fp) {
             fprintf(stderr,
-                    "[mulmat tune] warn: failed to open file `%s`, print to "
+                    "[tune] warn: failed to open file `%s`, print to "
                     "console instead\n\n",
                     params->fname);
             params->output_console = 1;
@@ -889,12 +889,12 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
             fclose(fp);
 
             if (ok) {
-                fprintf(stdout, "[mulmat tune] data was written to `%s`\n",
+                fprintf(stdout, "[tune] data was written to `%s`\n",
                         params->fname);
             } else {
                 fprintf(
                     stderr,
-                    "[mulmat tune] warn: failed to write file `%s`, print to "
+                    "[tune] warn: failed to write file `%s`, print to "
                     "console instead\n\n",
                     params->fname);
                 params->output_console = 1;
diff --git a/llama.cpp b/llama.cpp
index acc0e59f7..06555e1dd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2745,11 +2745,9 @@ struct llama_context * llama_init_from_file(
 #ifdef GGML_USE_TUNE
 bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune,
                        const char *fname) {
+    GGML_ASSERT(ctx->model.n_gpu_layers == 0);
+
     printf("\n");
-    if (ctx->model.n_gpu_layers != 0) {
-        fprintf(stderr, "[mulmat tune] error: is disabled by GPU offloading\n");
-        return false;
-    }
 
     const char *model_name = llama_model_type_name(ctx->model.type);
 
@@ -2855,7 +2853,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
     if (!empty_fname) {
        FILE *fp = fopen(fname, "r");
         if (!fp) {
-            fprintf(stderr, "[mulmat tune] failed to open file %s.\n",
+            fprintf(stderr, "[tune] failed to open file %s.\n",
                     fname);
         } else {
             bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
@@ -2863,12 +2861,12 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
             if (!ok) {
                 fprintf(stderr,
-                        "[mulmat tune] failed to read data from %s\n",
+                        "[tune] failed to read data from %s\n",
                         fname);
                 return false;
             }
 
-            fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname);
+            fprintf(stderr, "[tune] loaded data from %s\n", fname);
 
             ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype,
                                            params.n_threads);
             if (!ok) {
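
Note: the point of this change is to move the tune/offload conflict check out
of llama_mulmat_tune(), where it used to return false at runtime, up into
argument parsing, before any model weights are loaded; llama_mulmat_tune()
now simply asserts n_gpu_layers == 0. Below is a minimal self-contained
sketch of the parse-time guard; the struct and function names here are
illustrative, not part of the patch:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical mirror of the gpt_params fields this patch touches. */
    struct tune_opts {
        int         n_gpu_layers; /* -ngl: layers offloaded to the GPU */
        bool        tune;         /* --tune: run the mul_mat bench     */
        const char *tune_file;    /* --tune-file: load/save bench data */
    };

    /* Reject the combination at parse time, before loading the model,
     * mirroring the check added to gpt_params_parse() above. */
    static void check_tune_vs_offload(const struct tune_opts *o) {
        bool tuning = o->tune || (o->tune_file && o->tune_file[0] != '\0');
        if (o->n_gpu_layers > 0 && tuning) {
            fprintf(stderr, "[tune] error: tuning and GPU offloading cannot "
                            "be used at the same time, abort.\n");
            exit(1);
        }
    }

    int main(void) {
        struct tune_opts o = {32, true, ""}; /* as if: -ngl 32 --tune     */
        check_tune_vs_offload(&o);           /* prints the error, exits 1 */
        return 0;
    }

Failing fast at parse time is what makes the GGML_ASSERT in
llama_mulmat_tune() safe: by the time the model is loaded, the conflicting
combination can no longer occur, so the late "disabled by GPU offloading"
error path becomes unnecessary.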