mirror of https://github.com/ggerganov/llama.cpp.git
tuning: check GPU offloading before loading model
commit 7c05049f8b (parent bb590f1482)
@@ -435,6 +435,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         process_escapes(params.prompt);
     }
 
+#ifdef GGML_USE_TUNE
+    if (params.n_gpu_layers > 0) {
+        if (params.tune || !params.tune_file.empty()) {
+            fprintf(stderr, "[tune] error: tuning and GPU offloading cannot be used at the same time, abort.\n");
+            exit(1);
+        }
+    }
+#endif
     return true;
 }
 
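The new block rejects the conflicting flags at argument-parsing time, before any model is loaded, rather than discovering the conflict inside llama_mulmat_tune. Below is a self-contained sketch of the same fail-fast pattern; the struct and function names are hypothetical, not the llama.cpp API.

/* Fail-fast validation of mutually exclusive options (illustrative
 * sketch; names are hypothetical, the real check lives in
 * gpt_params_parse above). */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct opts {
    int  n_gpu_layers;  /* > 0 requests GPU offloading          */
    bool tune;          /* --tune: run the mulmat bench         */
    bool has_tune_file; /* --tune-file was given on the cmdline */
};

static void opts_validate(const struct opts *o) {
    /* tuning benchmarks CPU/BLAS task profiles, so results gathered
     * while layers run on the GPU would be meaningless */
    if (o->n_gpu_layers > 0 && (o->tune || o->has_tune_file)) {
        fprintf(stderr, "[tune] error: tuning and GPU offloading "
                        "cannot be used at the same time, abort.\n");
        exit(1);
    }
}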
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
     }
 
 #ifdef GGML_USE_TUNE
-    if (params.tune || !params.tune_file.empty()){
+    if (params.tune || !params.tune_file.empty()) {
         bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
         if (!ok || (params.tune && !params.tune_file.empty())) {
             llama_free(ctx);
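For readability: the guard `!ok || (params.tune && !params.tune_file.empty())` folds two exits into one. A stubbed C sketch of that control flow, with llama_mulmat_tune replaced by a stand-in so it compiles on its own:

/* Control-flow sketch of the caller above; mulmat_tune_stub stands in
 * for llama_mulmat_tune, everything else is illustrative. */
#include <stdbool.h>

static bool mulmat_tune_stub(bool tune, const char *fname) {
    (void)tune; (void)fname;
    return true; /* pretend the bench or the file load succeeded */
}

static int after_model_load(bool tune, const char *tune_file) {
    bool has_file = tune_file != NULL && tune_file[0] != '\0';
    bool ok = mulmat_tune_stub(tune, tune_file);
    if (!ok) {
        return 1; /* tuning failed: free the context and abort */
    }
    if (tune && has_file) {
        return 0; /* bench ran and results were saved: a pure tuning
                     run, nothing left to generate */
    }
    /* ... otherwise fall through into normal generation ... */
    return 0;
}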
ggml-tune.c (22 lines changed)
@@ -104,7 +104,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
                prof->stages[i].backend);
        }
        printf(
-           "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the "
+           "\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
            "fastest profile: %s %s %s\n",
            M, N, K, names[0], names[1], names[2]);
 #endif
@@ -358,7 +358,7 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
     bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads,
                                                  errbuf, sizeof(errbuf));
     if (!ok) {
-        fprintf(stderr, "[mulmat tune] error: %s. run bench again.\n", errbuf);
+        fprintf(stderr, "[tune] error: %s. run bench again.\n", errbuf);
     }
 
     return ok;
@@ -371,7 +371,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
     }
 
     if (tune->version != GGML_MULMAT_TUNE_VERSION) {
-        fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n");
+        fprintf(stderr, "[tune] version mismatch, run bench again\n");
         return false;
     }
 
@@ -396,7 +396,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
                           (shape->n_profiles * shape->m_num);
         shape->items = malloc(item_size);
         if (shape->items == NULL) {
-            fprintf(stderr, "[mulmat tune] failed to allocate memory\n");
+            fprintf(stderr, "[tune] failed to allocate memory\n");
             return false;
         }
         memset(shape->items, 0, item_size);
@@ -708,7 +708,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
 
     if (!buf) {
         fprintf(stderr,
-                "[mulmat tune] error: failed to allocate %zu MiB memory",
+                "[tune] error: failed to allocate %zu MiB memory",
                 sz / 1024 / 1024);
         return 0;
     }
@@ -745,7 +745,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
     int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
     if (n_backends < 2) {
         fprintf(stderr,
-                "[mulmat tune] error: this program was not built with BLAS.\n");
+                "[tune] error: this program was not built with BLAS.\n");
         return false;
     }
 
@@ -770,7 +770,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
     }
 
     fprintf(stdout,
-            "[mulmat tune] model: %s, ggml ftype: %d, "
+            "[tune] model: %s, ggml ftype: %d, "
             "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n",
             params->model.name, params->model.ftype, params->n_pass,
             params->n_threads, tune->n_shapes, buf);
@@ -871,7 +871,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
 
     ggml_threading_stop(thrd_ctx);
 
-    fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n",
+    fprintf(stdout, "[tune] done, elapsed time: %d seconds.\n",
             (int)(ggml_time_ms() - t0) / 1000);
 
     // output
@@ -880,7 +880,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
         FILE *fp = fopen(params->fname, "w");
         if (!fp) {
             fprintf(stderr,
-                    "[mulmat tune] warn: failed to open file `%s`, print to "
+                    "[tune] warn: failed to open file `%s`, print to "
                     "console instead\n\n",
                     params->fname);
             params->output_console = 1;
@@ -889,12 +889,12 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
             fclose(fp);
 
             if (ok) {
-                fprintf(stdout, "[mulmat tune] data was written to `%s`\n",
+                fprintf(stdout, "[tune] data was written to `%s`\n",
                         params->fname);
             } else {
                 fprintf(
                     stderr,
-                    "[mulmat tune] warn: failed to write file `%s`, print to "
+                    "[tune] warn: failed to write file `%s`, print to "
                     "console instead\n\n",
                     params->fname);
                 params->output_console = 1;
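Most of the ggml-tune.c hunks above are one mechanical rename: the log prefix `[mulmat tune]` becomes `[tune]`. One way to avoid this class of many-site rename, offered as a suggestion rather than anything this commit does, is to centralize the prefix behind a logging macro:

/* Hypothetical helper, not part of this commit: route all tune logging
 * through one macro so the prefix lives in exactly one place.
 * (##__VA_ARGS__ is a GNU extension, also accepted by Clang and MSVC.) */
#include <stdio.h>

#define TUNE_LOG_PREFIX "[tune]"
#define TUNE_LOG(stream, fmt, ...) \
    fprintf((stream), TUNE_LOG_PREFIX " " fmt, ##__VA_ARGS__)

/* usage, equivalent to the edited call sites above:
 *     TUNE_LOG(stderr, "error: %s. run bench again.\n", errbuf);
 *     TUNE_LOG(stderr, "version mismatch, run bench again\n");
 */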
llama.cpp (12 lines changed)
@@ -2745,11 +2745,9 @@ struct llama_context * llama_init_from_file(
 
 #ifdef GGML_USE_TUNE
 bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
+    GGML_ASSERT(ctx->model.n_gpu_layers == 0);
 
     printf("\n");
-    if (ctx->model.n_gpu_layers != 0) {
-        fprintf(stderr, "[mulmat tune] error: is disabled by GPU offloading\n");
-        return false;
-    }
 
     const char *model_name = llama_model_type_name(ctx->model.type);
 
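The callee's runtime check-and-return becomes a hard precondition: now that gpt_params_parse rejects the flag combination before the model loads, reaching llama_mulmat_tune with offloaded layers is a programming error rather than a user error. A sketch of an unconditional assert in the spirit of GGML_ASSERT follows; it is a re-creation under that assumption, not copied from ggml.h.

/* Unconditional assert: unlike <assert.h>, not compiled out by NDEBUG.
 * Hypothetical re-creation in the spirit of ggml's GGML_ASSERT. */
#include <stdio.h>
#include <stdlib.h>

#define MY_ASSERT(x)                                          \
    do {                                                      \
        if (!(x)) {                                           \
            fprintf(stderr, "ASSERT failed: %s:%d: %s\n",     \
                    __FILE__, __LINE__, #x);                  \
            abort();                                          \
        }                                                     \
    } while (0)

/* callee-side precondition, mirroring the hunk above:
 *     MY_ASSERT(ctx->model.n_gpu_layers == 0);
 */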
@@ -2855,7 +2853,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
     if (!empty_fname) {
         FILE *fp = fopen(fname, "r");
         if (!fp) {
-            fprintf(stderr, "[mulmat tune] failed to open file %s.\n",
+            fprintf(stderr, "[tune] failed to open file %s.\n",
                     fname);
         } else {
             bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
@@ -2863,12 +2861,12 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
 
             if (!ok) {
                 fprintf(stderr,
-                        "[mulmat tune] failed to read data from %s\n",
+                        "[tune] failed to read data from %s\n",
                         fname);
                 return false;
             }
 
-            fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname);
+            fprintf(stderr, "[tune] loaded data from %s\n", fname);
 
             ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads);
             if (!ok) {
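This last hunk sits inside a common cache-validation flow: read serialized bench data, validate it against the current model name, ftype, and thread count, and fall back to a fresh bench when anything mismatches. A compact sketch of that pattern, with illustrative names rather than the ggml-tune API:

/* Load-validate-fallback sketch; struct layout and names are
 * illustrative only. */
#include <stdbool.h>
#include <stdio.h>

struct tune_cache { int version; int n_threads; };

static bool cache_usable(const struct tune_cache *c,
                         int want_version, int want_threads) {
    return c->version == want_version && c->n_threads == want_threads;
}

static bool try_load(struct tune_cache *c, const char *fname,
                     int want_version, int want_threads) {
    FILE *fp = fname != NULL ? fopen(fname, "r") : NULL;
    if (fp == NULL) {
        return false; /* no cached data yet: caller runs the bench */
    }
    bool ok = fread(c, sizeof *c, 1, fp) == 1 &&
              cache_usable(c, want_version, want_threads);
    fclose(fp);
    if (!ok) {
        fprintf(stderr, "[tune] stale or unreadable data, run bench again\n");
    }
    return ok;
}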