Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-11 03:01:45 +00:00)
bulk refactored task profiles to support complete fallback; enable tune by default for ease of dev
This commit is contained in:
commit 48016f685c (parent 1b041d7737)
CMakeLists.txt

@@ -78,7 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
 option(LLAMA_BUILD_TESTS    "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" OFF)
-option(LLAMA_MULMAT_TUNE    "llama: mulmat tune" OFF)
+option(LLAMA_TUNE           "llama: mulmat tune" ON)

 #
 # Build info header
@@ -278,9 +278,9 @@ if (LLAMA_METAL)
     )
 endif()

-if (LLAMA_MULMAT_TUNE)
-    add_compile_definitions(GGML_USE_MULMAT_TUNE)
-    add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG)
+if (LLAMA_TUNE)
+    add_compile_definitions(GGML_USE_TUNE)
+    add_compile_definitions(GGML_TUNE_NDEBUG)
 endif()

 if (LLAMA_K_QUANTS)
Makefile (8 lines changed)

@@ -231,14 +231,14 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
     CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

-ifdef LLAMA_NO_K_QUANTS
+ifndef LLAMA_NO_K_QUANTS
 k_quants.o: k_quants.c k_quants.h
     $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_NO_K_QUANTS

-ifdef LLAMA_MULMAT_TUNE
-    CFLAGS   += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG
-    CXXFLAGS += -DGGML_USE_MULMAT_TUNE
+ifndef LLAMA_NO_TUNE
+    CFLAGS   += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG
+    CXXFLAGS += -DGGML_USE_TUNE
 endif

 #
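Note: with the hunks above, tune support is now compiled in by default. Going by the build options shown here, it can still be disabled per build system: pass `LLAMA_NO_TUNE=1` to `make`, or configure CMake with `-DLLAMA_TUNE=OFF`; either way the `GGML_USE_TUNE` and `GGML_TUNE_NDEBUG` definitions are simply not added.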
@@ -345,7 +345,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.mem_test = true;
         } else if (arg == "--export") {
             params.export_cgraph = true;
-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
         } else if (arg == "--tune") {
             params.tune = true;
         } else if (arg == "--tune-file") {
@@ -354,7 +354,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.tune_file = argv[i];
-#endif // GGML_USE_MULMAT_TUNE
+#endif // GGML_USE_TUNE
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -508,7 +508,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
     fprintf(stderr, "  --tune                mulmat tune enable. If tune-file is set then exit after bench\n");
     fprintf(stderr, "  --tune-file FILE      mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
 #endif
@@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
     if (params.tune || !params.tune_file.empty()) {
         bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
         if (!ok || (params.tune && !params.tune_file.empty())) {
@@ -23,13 +23,13 @@ run bench ahead of time (saving tens of seconds), but there are two shortcomings

 Makefile:
 ```
-make clean && LLAMA_MULMAT_TUNE=1 make
+make clean && make
 ```

 CMake (with BLAS):
 ```
 cmake --build . --target clean
-cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake .. -DLLAMA_BLAS=ON
 cmake --build . --config Release
 ```

@@ -52,13 +52,13 @@ Run examples:

 Makefile:
 ```
-make clean && LLAMA_MULMAT_TUNE=1 make
+make clean && make
 ```

 CMake (with BLAS)
 ```
 cmake --build . --target clean
-cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake .. -DLLAMA_BLAS=ON
 cmake --build . --config Release
 ```

@@ -103,22 +103,29 @@ setup properly.
 General steps:

 1. run `./mulmat-tune -h` to see how to build for misc vendors.
-   you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g:
+   To enable debug output, comment out `-DGGML_TUNE_NDEBUG` in the Makefile, then run:

    ```
-   make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make
+   make clean; make
    ```

    On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
    with `CUDA` or `CL`, you may not see `CUDA` or `CL` in the debug output because `CPU`
-   or `CPU_BLAS` is more faster (as of the estimation from mulmat tune).
+   or `CPU_BLAS` is faster (according to the mulmat tune estimation); try running
+   with `-t 1`.
 2. create a small prompt file:

    ```
    head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
    ```

 3. run any of the following example commands.

    ```
    ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
    ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
    ```

    * `--mlock` is recommended on `macOS`, but you may not want to use it.
    * don't change `-c 128`: too large a `context size` causes zero-perplexity chunks.
    * `-t` is the number of threads; `1`, `2`, `4` or `6` are recommended.
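Reading the `--tune`/`--tune-file` help text above, the intended workflow is two-pass: run once with both `--tune` and `--tune-file FILE` to bench and write the result (the program exits after the bench when both are given), then run normally with only `--tune-file FILE` so the saved data is loaded instead of re-benching.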
@@ -262,8 +262,6 @@ int main(int argc, char **argv) {
     struct ggml_mulmat_tune_params params;
     memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));

-    ggml_mulmat_init_task_profiles();
-
     ggml_mulmat_tune_model_init(&params.model, model_name, ftype);
     params.m_num = m_num;
     params.n_pass = n_pass;
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
     if (params.tune || !params.tune_file.empty()){
         bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
         if (!ok || (params.tune && !params.tune_file.empty())) {
@@ -394,7 +394,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
             enum ggml_compute_error err =
                 shared->task_runner(&state->params, state->node);

-            GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
+            GGML_ASSERT(err == GGML_COMPUTE_OK);

             ggml_spin_lock(&shared->spin);

@@ -433,7 +433,11 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,

     // This is the params for main thread.
     struct ggml_compute_params params;
-    enum ggml_compute_error err;
+    enum ggml_compute_error err = GGML_COMPUTE_OK;
+
+START:

     memset(&params, 0, sizeof(struct ggml_compute_params));

     for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
         if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {

@@ -504,11 +508,19 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
         }

         if (err != GGML_COMPUTE_OK) {
+            if (err == GGML_COMPUTE_FALLBACK) {
+                struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
+                int n = ggml_get_task_profiles(node, profiles);
+                GGML_ASSERT(n > 0);
+                memcpy(&node->task_profile, &profiles[0],
+                       sizeof(struct ggml_task_profile));
+                goto START;
+            }
             return err;
         }
     }

-    return GGML_COMPUTE_OK;
+    return err;
 }

 struct ggml_threading_context *
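As far as these hunks show, fallback handling moves up into `ggml_threading_compute_tensor()`: worker threads now assert plain `GGML_COMPUTE_OK`, while the main thread reacts to `GGML_COMPUTE_FALLBACK` by overwriting `node->task_profile` with the first profile returned by `ggml_get_task_profiles()` (the plain CPU profile) and retrying from `START`.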
ggml-tune.c (59 lines changed)
@ -55,7 +55,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
|
||||
|
||||
struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
|
||||
|
||||
struct ggml_task_profile *prof = NULL;
|
||||
const struct ggml_task_profile *prof = NULL;
|
||||
|
||||
if (e->M == M && e->N == N && e->K == K) {
|
||||
prof = e->profile;
|
||||
@ -97,10 +97,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
|
||||
e->N = N;
|
||||
e->K = K;
|
||||
|
||||
// to disable this, build with
|
||||
// `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1
|
||||
// make`
|
||||
#if !defined(GGML_MULMAT_TUNE_NDEBUG)
|
||||
#ifndef GGML_TUNE_NDEBUG
|
||||
const char *names[3];
|
||||
for (int i = 0; i < 3; i++) {
|
||||
names[i] = ggml_mulmat_tune_task_backend_name(
|
||||
@ -163,8 +160,8 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
|
||||
|
||||
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
|
||||
struct ggml_mulmat_tune_params *params,
|
||||
struct ggml_task_profile_factory *pf) {
|
||||
|
||||
ggml_task_profiles_provider *profiles_provider) {
|
||||
GGML_ASSERT(profiles_provider);
|
||||
struct ggml_mulmat_tune_model *model = ¶ms->model;
|
||||
|
||||
memset(tune, 0, sizeof(struct ggml_mulmat_tune));
|
||||
@ -208,8 +205,20 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
|
||||
|
||||
for (int i = 0; i < tune->n_shapes; i++) {
|
||||
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
|
||||
shape->n_profiles = ggml_mulmat_get_task_profiles(
|
||||
pf, shape->src0_type, shape->src1_type, &shape->profiles);
|
||||
|
||||
struct ggml_tensor src0 = {
|
||||
.type = shape->src0_type,
|
||||
};
|
||||
struct ggml_tensor src1 = {
|
||||
.type = shape->src1_type,
|
||||
};
|
||||
struct ggml_tensor node = {
|
||||
.op = GGML_OP_MUL_MAT,
|
||||
.src0 = &src0,
|
||||
.src1 = &src1,
|
||||
};
|
||||
|
||||
shape->n_profiles = profiles_provider(&node, shape->profiles);
|
||||
if (shape->n_profiles == 0) {
|
||||
// allowed for testing.
|
||||
continue;
|
||||
@ -304,9 +313,20 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
|
||||
for (int i = 0; i < tune->n_shapes; i++) {
|
||||
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
|
||||
|
||||
struct ggml_task_profile *builtin_profiles = NULL;
|
||||
int n_profiles = ggml_mulmat_get_task_profiles(
|
||||
NULL, shape->src0_type, shape->src1_type, &builtin_profiles);
|
||||
struct ggml_tensor src0 = {
|
||||
.type = shape->src0_type,
|
||||
};
|
||||
struct ggml_tensor src1 = {
|
||||
.type = shape->src1_type,
|
||||
};
|
||||
struct ggml_tensor node = {
|
||||
.op = GGML_OP_MUL_MAT,
|
||||
.src0 = &src0,
|
||||
.src1 = &src1,
|
||||
};
|
||||
|
||||
struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
|
||||
int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
|
||||
|
||||
if (n_profiles != shape->n_profiles) {
|
||||
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
|
||||
@ -382,13 +402,6 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
|
||||
memset(shape->items, 0, item_size);
|
||||
}
|
||||
|
||||
{
|
||||
size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles;
|
||||
shape->profiles = malloc(sz);
|
||||
GGML_ASSERT(shape->profiles);
|
||||
memset(shape->profiles, 0, sz);
|
||||
}
|
||||
|
||||
for (int ip = 0; ip < shape->n_profiles; ip++) {
|
||||
struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
for (int j = 0; j < 3; j++) {
|
||||
@ -468,7 +481,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
const struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
for (int k = 0; k < 3; k++) {
|
||||
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
|
||||
rc = fprintf(fp, "%9d", item->stages_time[k]);
|
||||
@ -537,7 +550,7 @@ void ggml_mulmat_tune_estimate_time(
|
||||
const int max_m = shape->items[m_num - 1].M;
|
||||
|
||||
for (int ip = 0; ip < shape->n_profiles; ip++) {
|
||||
struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
const struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
profile_time[ip].total_time = 0;
|
||||
profile_time[ip].profile = profile;
|
||||
|
||||
@ -573,7 +586,7 @@ void ggml_mulmat_tune_estimate_time(
|
||||
GGML_ASSERT(p0 && p1);
|
||||
|
||||
for (int i_stage = 0; i_stage < 3; i_stage++) {
|
||||
struct ggml_task_stage *stage = &profile->stages[i_stage];
|
||||
const struct ggml_task_stage *stage = &profile->stages[i_stage];
|
||||
if (stage->backend == GGML_TASK_BACKEND_NONE) {
|
||||
continue;
|
||||
}
|
||||
@ -736,7 +749,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ok = ggml_mulmat_tune_init(tune, params, NULL);
|
||||
bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
|
@@ -46,7 +46,7 @@ struct ggml_mulmat_tune_shape {
     enum ggml_type src1_type;

     int n_profiles;
-    struct ggml_task_profile *profiles;
+    struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];

     int m_num;
     int *arr_m;
@@ -69,7 +69,7 @@ struct ggml_mulmat_tune {
 };

 struct ggml_mulmat_tune_time {
-    struct ggml_task_profile *profile;
+    const struct ggml_task_profile *profile;
     int stage_time[3];
     int total_time;
 };
@@ -78,7 +78,7 @@ struct mm_cache_element {
     int M;
     int N;
     int K;
-    struct ggml_task_profile *profile;
+    const struct ggml_task_profile *profile;
     int stages_time[3];
 };

@@ -108,7 +108,7 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,

 bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
                            struct ggml_mulmat_tune_params *params,
-                           struct ggml_task_profile_factory *profile_factory);
+                           ggml_task_profiles_provider *profiles_provider);

 void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);
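In short, `ggml_mulmat_tune_init()` now takes a profiles-provider function instead of a `ggml_task_profile_factory` pointer. A call-site contrast, using the variables from `ggml_mulmat_tune_bench()` in the ggml-tune.c hunks above (a sketch, not new API):

```c
// Before: a factory struct; NULL meant "use the built-in default profiles".
// bool ok = ggml_mulmat_tune_init(tune, params, NULL);

// After: a ggml_task_profiles_provider function; the built-in provider is
// ggml_get_task_profiles, and tests may pass a mock provider instead.
bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
```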
ggml.c (535 lines changed)
@ -144,7 +144,7 @@ inline static void* ggml_aligned_malloc(size_t size) {
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_MULMAT_TUNE)
|
||||
#if defined(GGML_USE_TUNE)
|
||||
#include "ggml-tune.h"
|
||||
#endif
|
||||
|
||||
@ -4043,8 +4043,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||
ggml_cl_init();
|
||||
#endif
|
||||
|
||||
ggml_mulmat_init_task_profiles();
|
||||
|
||||
is_first_call = false;
|
||||
}
|
||||
|
||||
@ -15524,20 +15522,89 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
|
||||
return result;
|
||||
}
|
||||
|
||||
// ---- mulmat task profiles ----
|
||||
// ---- task profiles ----
|
||||
|
||||
static struct ggml_task_profile_factory default_task_profile_factory = {0};
|
||||
// Implement `ggml_task_profiles_provider`.
|
||||
// Fill `profiles` for the `node` and return number of profiles.
|
||||
//
|
||||
// NOTE: the node may be incomplete when it comes from testing or tuning, so please assert
|
||||
// everything used here.
|
||||
inline int ggml_get_task_profiles(
|
||||
struct ggml_tensor *node,
|
||||
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) {
|
||||
GGML_ASSERT(node);
|
||||
GGML_ASSERT(node->op >= 0);
|
||||
GGML_ASSERT(profiles);
|
||||
|
||||
// TODO: thread unsafe. Should be initialized once.
|
||||
void ggml_mulmat_init_task_profiles(void) {
|
||||
const size_t sz = sizeof(struct ggml_task_profile_factory);
|
||||
memset(&default_task_profile_factory, 0, sz);
|
||||
memset(profiles, 0,
|
||||
sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES);
|
||||
|
||||
struct ggml_task_profile *p = profiles;
|
||||
int n_profiles = 0;
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_ACC: {
|
||||
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_ABS:
|
||||
case GGML_OP_SGN:
|
||||
case GGML_OP_NEG:
|
||||
case GGML_OP_STEP:
|
||||
case GGML_OP_RELU: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_MUL: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_GELU:
|
||||
case GGML_OP_SILU:
|
||||
case GGML_OP_SILU_BACK:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_RMS_NORM_BACK: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_OUT_PROD: {
|
||||
GGML_ASSERT(node->src0);
|
||||
GGML_ASSERT(node->src1);
|
||||
|
||||
enum ggml_type src0_t = node->src0->type;
|
||||
enum ggml_type src1_t = node->src1->type;
|
||||
|
||||
GGML_ASSERT(src1_t == GGML_TYPE_F32);
|
||||
|
||||
// f32
|
||||
{
|
||||
struct ggml_task_profile *p = default_task_profile_factory.f32_f32;
|
||||
int i = 0;
|
||||
|
||||
if (src0_t == GGML_TYPE_F32) {
|
||||
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[i].stages[1].parallel = true;
|
||||
i++;
|
||||
@ -15557,14 +15624,7 @@ void ggml_mulmat_init_task_profiles(void) {
|
||||
p[i].stages[1].wait = true;
|
||||
i++;
|
||||
#endif
|
||||
default_task_profile_factory.n_f32_f32 = i;
|
||||
}
|
||||
|
||||
// f16
|
||||
{
|
||||
struct ggml_task_profile *p = default_task_profile_factory.f16_f32;
|
||||
int i = 0;
|
||||
|
||||
} else if (src0_t == GGML_TYPE_F16) {
|
||||
p[i].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[i].stages[1].parallel = true;
|
||||
@ -15585,14 +15645,7 @@ void ggml_mulmat_init_task_profiles(void) {
|
||||
p[i].stages[1].wait = true;
|
||||
i++;
|
||||
#endif
|
||||
default_task_profile_factory.n_f16_f32 = i;
|
||||
}
|
||||
|
||||
// qxx
|
||||
{
|
||||
struct ggml_task_profile *p = default_task_profile_factory.qxx_f32;
|
||||
int i = 0;
|
||||
|
||||
} else if (ggml_is_quantized(src0_t)) {
|
||||
p[i].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[i].stages[1].parallel = true;
|
||||
@ -15615,73 +15668,108 @@ void ggml_mulmat_init_task_profiles(void) {
|
||||
p[i].stages[1].wait = true;
|
||||
i++;
|
||||
#endif
|
||||
default_task_profile_factory.n_qxx_f32 = i;
|
||||
}
|
||||
n_profiles = i;
|
||||
} break;
|
||||
case GGML_OP_SCALE: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_SET: {
|
||||
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
case GGML_OP_DIAG:
|
||||
case GGML_OP_DIAG_MASK_ZERO: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_ALIBI: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_CLAMP: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_CONV_1D_1S:
|
||||
case GGML_OP_CONV_1D_2S: {
|
||||
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
}
|
||||
|
||||
int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf,
|
||||
enum ggml_type src0_t, enum ggml_type src1_t,
|
||||
struct ggml_task_profile **profiles) {
|
||||
GGML_ASSERT(profiles);
|
||||
|
||||
if (pf == NULL) {
|
||||
pf = &default_task_profile_factory;
|
||||
}
|
||||
|
||||
GGML_ASSERT(src1_t == GGML_TYPE_F32);
|
||||
|
||||
if (src0_t == GGML_TYPE_F32) {
|
||||
*profiles = pf->f32_f32;
|
||||
return pf->n_f32_f32;
|
||||
}
|
||||
|
||||
if (src0_t == GGML_TYPE_F16) {
|
||||
*profiles = pf->f16_f32;
|
||||
return pf->n_f16_f32;
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(src0_t)) {
|
||||
*profiles = pf->qxx_f32;
|
||||
return pf->n_qxx_f32;
|
||||
}
|
||||
|
||||
case GGML_OP_FLASH_ATTN_BACK: {
|
||||
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_MAP_UNARY:
|
||||
case GGML_OP_MAP_BINARY: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
p[0].stages[2].backend = GGML_TASK_BACKEND_CPU;
|
||||
n_profiles = 1;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK: {
|
||||
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
p[0].stages[1].parallel = true;
|
||||
n_profiles = 1;
|
||||
} break;
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_COUNT: {
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
static const struct ggml_task_profile *
|
||||
ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf,
|
||||
enum ggml_type src0_type,
|
||||
enum ggml_type src1_type) {
|
||||
GGML_ASSERT(src1_type == GGML_TYPE_F32);
|
||||
if (pf == NULL) {
|
||||
pf = &default_task_profile_factory;
|
||||
}
|
||||
|
||||
struct ggml_task_profile *p = NULL;
|
||||
|
||||
if (src0_type == GGML_TYPE_F32) {
|
||||
p = &pf->f32_f32[0];
|
||||
} else if (src0_type == GGML_TYPE_F16) {
|
||||
p = &pf->f16_f32[0];
|
||||
} else if (ggml_is_quantized(src0_type)) {
|
||||
p = &pf->qxx_f32[0];
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU ||
|
||||
p->stages[i].backend == GGML_TASK_BACKEND_NONE);
|
||||
}
|
||||
|
||||
return p;
|
||||
GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES);
|
||||
return n_profiles;
|
||||
}
|
||||
|
||||
// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD.
|
||||
static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node,
|
||||
struct ggml_mulmat_tune *tune) {
|
||||
static const struct ggml_task_profile *ggml_mulmat_get_task_profile(
|
||||
struct ggml_tensor *node, struct ggml_task_profile *profiles,
|
||||
int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) {
|
||||
|
||||
GGML_ASSERT(node);
|
||||
GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD);
|
||||
GGML_ASSERT(profiles);
|
||||
GGML_ASSERT(n_profiles >= 2);
|
||||
|
||||
enum ggml_type src0_t = node->src0->type;
|
||||
enum ggml_type src1_t = node->src1->type;
|
||||
@ -15697,42 +15785,26 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node,
|
||||
int N = (int)node->ne[0];
|
||||
int K = (int)node->src1->ne[0];
|
||||
|
||||
struct ggml_task_profile *profiles = NULL;
|
||||
int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles);
|
||||
GGML_ASSERT(n_profiles >= 2);
|
||||
GGML_ASSERT(profiles);
|
||||
|
||||
const struct ggml_task_profile *prof = NULL;
|
||||
|
||||
if (cond_match) {
|
||||
#if defined(GGML_USE_MULMAT_TUNE)
|
||||
#if defined(GGML_USE_TUNE)
|
||||
if (tune != NULL) {
|
||||
int stages_time_us[3];
|
||||
prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us);
|
||||
prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t,
|
||||
src1_t, stages_time_us);
|
||||
if (prof != NULL) {
|
||||
GGML_ASSERT(prof);
|
||||
memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile));
|
||||
// Do not wait if the estimated execution time is too small (e.g. less than 0.1 ms)
|
||||
// TODO: need bench actual wait/notify time, see ggml-threading.c
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (node->task_profile.stages[i].wait) {
|
||||
if (stages_time_us[i] < 100) {
|
||||
node->task_profile.stages[i].wait = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
return prof;
|
||||
}
|
||||
}
|
||||
#else
|
||||
UNUSED(tune);
|
||||
UNUSED(stages_time_us);
|
||||
#endif
|
||||
|
||||
if (prof == NULL && M >= 32 && N >= 32 && K >= 32) {
|
||||
for (int j = 0; j < n_profiles; j++) {
|
||||
enum ggml_task_backend comp_be =
|
||||
profiles[j].stages[GGML_TASK_COMPUTE].backend;
|
||||
|
||||
switch (comp_be) {
|
||||
case GGML_TASK_BACKEND_GPU_CUDA: {
|
||||
GGML_ASSERT(ggml_cpu_has_cublas());
|
||||
@ -15753,76 +15825,131 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node,
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (prof) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (prof == NULL) {
|
||||
prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t);
|
||||
prof = &profiles[0];
|
||||
GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU);
|
||||
}
|
||||
|
||||
GGML_ASSERT(prof);
|
||||
memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile));
|
||||
return prof;
|
||||
}
|
||||
|
||||
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
int n_threads = cgraph->n_threads;
|
||||
|
||||
if (ggml_cpu_has_blas()) {
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor *node = cgraph->nodes[i];
|
||||
|
||||
memset(&node->task_profile, 0, sizeof(struct ggml_task_profile));
|
||||
struct ggml_task_stage *stages = node->task_profile.stages;
|
||||
|
||||
// Adapt node->backend: assume GPU at COMPUTE stage.
|
||||
if (node->backend > GGML_BACKEND_CPU) {
|
||||
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE;
|
||||
stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE;
|
||||
|
||||
stages[GGML_TASK_COMPUTE].parallel = false;
|
||||
bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL);
|
||||
stages[GGML_TASK_COMPUTE].wait = wait;
|
||||
if (ggml_cpu_has_cublas()) {
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA;
|
||||
} else if (ggml_cpu_has_clblast()) {
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} else if (node->op == GGML_OP_MUL_MAT) {
|
||||
struct ggml_mulmat_tune * tune = NULL;
|
||||
#if defined(GGML_USE_MULMAT_TUNE)
|
||||
tune = cgraph->tune;
|
||||
#endif
|
||||
ggml_mulmat_set_tensor_task_profile(node, tune);
|
||||
} else if (node->op == GGML_OP_OUT_PROD) {
|
||||
ggml_mulmat_set_tensor_task_profile(node, NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_threading_context *thrd_ctx = ggml_threading_start(
|
||||
n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward,
|
||||
GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL);
|
||||
|
||||
// initialize tasks + work buffer
|
||||
{
|
||||
// int64_t t0 = ggml_time_us();
|
||||
|
||||
size_t work_size = 0;
|
||||
|
||||
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
|
||||
|
||||
// thread scheduling for the different operations
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int n_profiles = ggml_get_task_profiles(node, profiles);
|
||||
|
||||
const struct ggml_task_profile *profile = NULL;
|
||||
|
||||
// Adapt node->backend: assume GPU at COMPUTE stage.
|
||||
if (node->backend == GGML_BACKEND_GPU ||
|
||||
node->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
enum ggml_task_backend be;
|
||||
if (ggml_cpu_has_cublas()) {
|
||||
be = GGML_TASK_BACKEND_GPU_CUDA;
|
||||
} else if (ggml_cpu_has_clblast()) {
|
||||
be = GGML_TASK_BACKEND_GPU_CL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
for (int j = 0; j < n_profiles; j++) {
|
||||
if (profiles[j].stages[1].backend == be) {
|
||||
profile = &profiles[j];
|
||||
break;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(profile);
|
||||
} else {
|
||||
GGML_ASSERT(node->backend == GGML_BACKEND_CPU);
|
||||
}
|
||||
|
||||
bool profile_copied = false;
|
||||
|
||||
if (node->op == GGML_OP_MUL_MAT) {
|
||||
#if defined(GGML_USE_TUNE)
|
||||
int stages_time_us[3];
|
||||
profile = ggml_mulmat_get_task_profile(
|
||||
node, profiles, n_profiles, cgraph->tune, stages_time_us);
|
||||
GGML_ASSERT(profile);
|
||||
|
||||
if (cgraph->tune) {
|
||||
memcpy(&node->task_profile, profile,
|
||||
sizeof(struct ggml_task_profile));
|
||||
profile_copied = true;
|
||||
|
||||
// Do not wait if the estimated execution time is too small
|
||||
// (e.g. less than 0.1 ms)
|
||||
// TODO: need bench actual wait/notify time, see
|
||||
// ggml-threading.c
|
||||
for (int j = 0; j< 3; j++) {
|
||||
if (node->task_profile.stages[j].wait) {
|
||||
if (stages_time_us[j] < 100) {
|
||||
node->task_profile.stages[j].wait = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
profile = ggml_mulmat_get_task_profile(node, profiles,
|
||||
n_profiles, NULL, NULL);
|
||||
GGML_ASSERT(profile);
|
||||
#endif
|
||||
} else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this right?
|
||||
profile = ggml_mulmat_get_task_profile(node, profiles,
|
||||
n_profiles, NULL, NULL);
|
||||
GGML_ASSERT(profile);
|
||||
} else {
|
||||
profile = &profiles[0];
|
||||
GGML_ASSERT(profile->stages[1].backend ==
|
||||
GGML_TASK_BACKEND_CPU);
|
||||
}
|
||||
|
||||
if (!profile_copied) {
|
||||
memcpy(&node->task_profile, profile,
|
||||
sizeof(struct ggml_task_profile));
|
||||
}
|
||||
|
||||
struct ggml_task_stage *stages = node->task_profile.stages;
|
||||
|
||||
// compute stage n_tasks.
|
||||
int n_tasks = stages[1].parallel ? n_threads : 1;
|
||||
|
||||
// Allocate temp buffer `wdata` for CPU.
|
||||
// NOTE: the GPU may fall back to CPU, so we have to cover all possible cases.
|
||||
switch (node->op) {
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
size_t cur = 0;
|
||||
if (ggml_is_quantized(node->type)) {
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
@ -15830,27 +15957,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
if (ggml_is_quantized(node->src0->type)) {
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks;
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_ACC:
|
||||
{
|
||||
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
if (ggml_is_quantized(node->src0->type)) {
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks;
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
@ -15870,16 +15990,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
case GGML_OP_NEG:
|
||||
case GGML_OP_STEP:
|
||||
case GGML_OP_RELU:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) {
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_GELU:
|
||||
case GGML_OP_SILU:
|
||||
case GGML_OP_SILU_BACK:
|
||||
@ -15887,28 +15998,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_OUT_PROD:
|
||||
case GGML_OP_OUT_PROD: // FIXME: is this right?
|
||||
{
|
||||
size_t cur = 0;
|
||||
enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend;
|
||||
GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE);
|
||||
|
||||
// TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct.
|
||||
if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) {
|
||||
enum ggml_type src0_t = node->src0->type;
|
||||
enum ggml_type src1_t = node->src1->type;
|
||||
bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 ||
|
||||
ggml_is_quantized(src0_t)) &&
|
||||
src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 &&
|
||||
ggml_is_contiguous(node->src0) &&
|
||||
ggml_is_contiguous(node->src1);
|
||||
GGML_ASSERT(cond_match);
|
||||
}
|
||||
|
||||
if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
@ -15930,7 +16027,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
|
||||
// We have to reserve a buffer for CUDA because it may fall back to CPU.
|
||||
if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
|
||||
GGML_ASSERT(ggml_cpu_has_cublas());
|
||||
}
|
||||
@ -15955,13 +16051,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
} break;
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
} break;
|
||||
case GGML_OP_SET:
|
||||
{
|
||||
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
} break;
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_RESHAPE:
|
||||
@ -15972,33 +16064,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
case GGML_OP_DIAG:
|
||||
case GGML_OP_DIAG_MASK_ZERO:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
} break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
} break;
|
||||
case GGML_OP_ALIBI:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
} break;
|
||||
case GGML_OP_CLAMP:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
} break;
|
||||
case GGML_OP_CONV_1D_1S:
|
||||
case GGML_OP_CONV_1D_2S:
|
||||
{
|
||||
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
|
||||
GGML_ASSERT(node->src0->ne[3] == 1);
|
||||
GGML_ASSERT(node->src1->ne[2] == 1);
|
||||
GGML_ASSERT(node->src1->ne[3] == 1);
|
||||
@ -16026,62 +16103,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2
|
||||
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2
|
||||
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
size_t cur = 0;
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2
|
||||
cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2
|
||||
cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
const int64_t D = node->src0->ne[0];
|
||||
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
|
||||
if (node->src1->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2
|
||||
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2
|
||||
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
@ -16089,31 +16157,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
case GGML_OP_MAP_UNARY:
|
||||
case GGML_OP_MAP_BINARY:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
} break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
{
|
||||
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU;
|
||||
|
||||
size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads);
|
||||
size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_tasks);
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[GGML_TASK_COMPUTE].parallel = true;
|
||||
|
||||
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads;
|
||||
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks;
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_NONE:
|
||||
{
|
||||
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
|
||||
} break;
|
||||
case GGML_OP_COUNT:
|
||||
{
|
||||
@ -16134,6 +16192,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
|
||||
cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
|
||||
}
|
||||
|
||||
// ~ 50 us
|
||||
//printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0));
|
||||
}
|
||||
|
||||
const int64_t perf_start_cycles = ggml_perf_cycles();
|
||||
@ -16162,16 +16223,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||
|
||||
enum ggml_compute_error err =
|
||||
ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
|
||||
if (err == GGML_COMPUTE_FALLBACK) {
|
||||
if (node->op == GGML_OP_MUL_MAT) {
|
||||
const struct ggml_task_profile *p =
|
||||
ggml_mulmat_get_default_task_profile(
|
||||
NULL, node->src0->type, node->src1->type);
|
||||
memcpy(&node->task_profile, p,
|
||||
sizeof(struct ggml_task_profile));
|
||||
}
|
||||
err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
|
||||
}
|
||||
GGML_ASSERT(err == GGML_COMPUTE_OK);
|
||||
|
||||
// performance stats (node)
|
||||
|
ggml.h (28 lines changed)
@@ -202,7 +202,7 @@
 #define GGML_MAX_OPT           4
 #define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4
-#define GGML_MAX_TASK_PROFILES 8
+#define GGML_MAX_TASK_PROFILES 4

 #define GGML_ASSERT(x) \
     do { \
@@ -399,17 +399,6 @@ extern "C" {
         uint8_t dev_flags[4];
     };

-    struct ggml_task_profile_factory {
-        struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES];
-        int n_f32_f32;
-
-        struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES];
-        int n_f16_f32;
-
-        struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES];
-        int n_qxx_f32;
-    };
-
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
@@ -450,6 +439,11 @@ extern "C" {
         char padding[12];
     };

+    // Fill `profiles` for the `node` and return number of profiles.
+    typedef int (ggml_task_profiles_provider) (
+        struct ggml_tensor *node,
+        struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]);
+
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

     // computation graph
@@ -1345,15 +1339,11 @@ extern "C" {
     GGML_API int ggml_cpu_has_vsx (void);

     //
-    // mulmat task profiles
+    // task profiles
     //
-    GGML_API void ggml_mulmat_init_task_profiles(void);
-
-    GGML_API int ggml_mulmat_get_task_profiles(
-        struct ggml_task_profile_factory *pf,
-        enum ggml_type src0_t,
-        enum ggml_type src1_t,
-        struct ggml_task_profile **profiles);
+    // Implements `ggml_task_profiles_provider`.
+    GGML_API int ggml_get_task_profiles (struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]);

     //
     // Internal types and functions exposed for tests and benchmarks
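As a concrete illustration of the new typedef, a custom provider can be written and handed to `ggml_mulmat_tune_init()` in place of the built-in `ggml_get_task_profiles`. The sketch below is modeled on the mock provider in the test changes further down; the two-profile layout (plain CPU vs. CPU with a BLAS compute stage) is illustrative, not prescribed by the API:

```c
#include "ggml.h"

// Minimal `ggml_task_profiles_provider` implementation (assumes the profiles
// array arrives zero-initialized, as it does on the tune path).
static int my_task_profiles_provider(
    struct ggml_tensor *node,
    struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) {
    (void) node; // a real provider would switch on node->op, as ggml_get_task_profiles() does

    // profile 0: CPU for both the INIT and COMPUTE stages
    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
    profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;

    // profile 1: CPU INIT stage, BLAS COMPUTE stage
    profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
    profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;

    return 2; // number of profiles filled in
}
```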
llama.cpp (10 lines changed)
@@ -21,7 +21,7 @@
 #include "ggml-metal.h"
 #endif

-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
 #include "ggml-tune.h"
 #endif

@@ -285,7 +285,7 @@ struct llama_context {
     int    buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
     struct ggml_mulmat_tune *tune = nullptr;
 #endif

@@ -1408,7 +1408,7 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = n_threads;

-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
     gf.tune = lctx.tune;
 #endif

@@ -2743,7 +2743,7 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }

-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
 bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
     printf("\n");
     if (ctx->model.n_gpu_layers != 0) {

@@ -2882,7 +2882,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
 #endif

 void llama_free(struct llama_context * ctx) {
-#ifdef GGML_USE_MULMAT_TUNE
+#ifdef GGML_USE_TUNE
     if (ctx->tune) {
         delete(ctx->tune);
     }
@@ -356,6 +356,17 @@ int main(void) {
     {
         ++n_tests;

+        // required by getting task profiles.
+        node.op = GGML_OP_MUL_MAT;
+        struct ggml_tensor src0 = {
+            .type = GGML_TYPE_Q4_0,
+        };
+        struct ggml_tensor src1 = {
+            .type = GGML_TYPE_F32,
+        };
+        node.src0 = &src0;
+        node.src1 = &src1;
+
         node.backend = GGML_BACKEND_GPU;
         if (test_fallback(&node) == 0) {
             ++n_passed;
@@ -3,6 +3,8 @@

 #include <string.h>

+#define UNUSED(x) (void)(x)
+
 static int bench(void);
 static int estimate_time_non_zero_NK(void);

@@ -77,6 +79,20 @@ static int bench(void) {
     return ok ? 0 : 1;
 }

+// implement `ggml_task_profiles_provider`
+static int
+ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
+                                     struct ggml_task_profile *profiles) {
+    UNUSED(node);
+    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
+    profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
+    profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
+    profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
+    profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
+
+    return 2;
+}
+
 int estimate_time_non_zero_NK(void) {
     printf("test: %s\n", __func__);

@@ -92,22 +108,10 @@ int estimate_time_non_zero_NK(void) {

     const int m_num = 2;

-    struct ggml_task_profile_factory pf;
-    memset(&pf, 0, sizeof(struct ggml_task_profile_factory));
-
-    {
-        pf.n_qxx_f32 = 2;
-        pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
-        pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
-
-        pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
-        pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
-    }
-
     struct ggml_mulmat_tune_params params;
     init_params(&params, m_num);

-    ggml_mulmat_tune_init(&tune, &params, &pf);
+    ggml_mulmat_tune_init(&tune, &params, ggml_task_profiles_mock_qxx_provider);

     struct ggml_mulmat_tune_shape *shape = NULL;
     for (int i = 0; i < tune.n_shapes; i++) {