bulk refactored task profile to support complete fallback; enable tune by default for ease of dev

This commit is contained in:
mqy 2023-06-15 06:43:08 +08:00
parent 1b041d7737
commit 48016f685c
15 changed files with 480 additions and 394 deletions

View File

@ -78,7 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" OFF) option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF) option(LLAMA_TUNE "llama: mulmat tune" ON)
# #
# Build info header # Build info header
@ -278,9 +278,9 @@ if (LLAMA_METAL)
) )
endif() endif()
if (LLAMA_MULMAT_TUNE) if (LLAMA_TUNE)
add_compile_definitions(GGML_USE_MULMAT_TUNE) add_compile_definitions(GGML_USE_TUNE)
add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG) add_compile_definitions(GGML_TUNE_NDEBUG)
endif() endif()
if (LLAMA_K_QUANTS) if (LLAMA_K_QUANTS)

View File

@ -231,14 +231,14 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif endif
ifdef LLAMA_NO_K_QUANTS ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS endif # LLAMA_NO_K_QUANTS
ifdef LLAMA_MULMAT_TUNE ifndef LLAMA_NO_TUNE
CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG
CXXFLAGS += -DGGML_USE_MULMAT_TUNE CXXFLAGS += -DGGML_USE_TUNE
endif endif
# #

View File

@ -345,7 +345,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.mem_test = true; params.mem_test = true;
} else if (arg == "--export") { } else if (arg == "--export") {
params.export_cgraph = true; params.export_cgraph = true;
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
} else if (arg == "--tune") { } else if (arg == "--tune") {
params.tune = true; params.tune = true;
} else if (arg == "--tune-file") { } else if (arg == "--tune-file") {
@ -354,7 +354,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.tune_file = argv[i]; params.tune_file = argv[i];
#endif // GGML_USE_MULMAT_TUNE #endif // GGML_USE_TUNE
} else if (arg == "--verbose-prompt") { } else if (arg == "--verbose-prompt") {
params.verbose_prompt = true; params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") { } else if (arg == "-r" || arg == "--reverse-prompt") {
@ -508,7 +508,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif #endif
fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n"); fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n");
fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n"); fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
#endif #endif

View File

@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
if (params.tune || !params.tune_file.empty()) { if (params.tune || !params.tune_file.empty()) {
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) { if (!ok || (params.tune && !params.tune_file.empty())) {

View File

@ -23,13 +23,13 @@ run bench ahead of time (saving tens of seconds), but there are two shortcomings
Makefile: Makefile:
``` ```
make clean && LLAMA_MULMAT_TUNE=1 make make clean && make
``` ```
CMake (with BLAS): CMake (with BLAS):
``` ```
cmake --build . --target clean cmake --build . --target clean
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON cmake .. -DLLAMA_BLAS=ON
cmake --build . --config Release cmake --build . --config Release
``` ```
@ -52,13 +52,13 @@ Run examples:
Makefile: Makefile:
``` ```
make clean && LLAMA_MULMAT_TUNE=1 make make clean && make
``` ```
CMake (with BLAS) CMake (with BLAS)
``` ```
cmake --build . --target clean cmake --build . --target clean
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON cmake .. -DLLAMA_BLAS=ON
cmake --build . --config Release cmake --build . --config Release
``` ```
@ -103,22 +103,29 @@ setup properly.
General steps: General steps:
1. run `./mulmat-tune -h` to see how to build for misc vendors. 1. run `./mulmat-tune -h` to see how to build for misc vendors.
you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g: To enable the debug, comment out `-DGGML_TUNE_NDEBUG` from makefile then run:
``` ```
make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make make clean; make
``` ```
On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU` with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU`
or `CPU_BLAS` is more faster (as of the estimation from mulmat tune). or `CPU_BLAS` is more faster (as of the estimation from mulmat tune), try run
with `-t 1`?
2. create a small prompt file: 2. create a small prompt file:
``` ```
head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
``` ```
3. run any of the following example commands. 3. run any of the following example commands.
``` ```
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32 ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64 ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
``` ```
* `--mlock` is recommended for `macOS`, you may not want to use it. * `--mlock` is recommended for `macOS`, you may not want to use it.
* don't change `-c 128`: too large `context size` causes 0 perplexity trunk. * don't change `-c 128`: too large `context size` causes 0 perplexity trunk.
* `-t` is the number of threads, recommend `1`, `2`, `4` or `6`. * `-t` is the number of threads, recommend `1`, `2`, `4` or `6`.

View File

@ -262,8 +262,6 @@ int main(int argc, char **argv) {
struct ggml_mulmat_tune_params params; struct ggml_mulmat_tune_params params;
memset(&params, 0, sizeof(struct ggml_mulmat_tune_params)); memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
ggml_mulmat_init_task_profiles();
ggml_mulmat_tune_model_init(&params.model, model_name, ftype); ggml_mulmat_tune_model_init(&params.model, model_name, ftype);
params.m_num = m_num; params.m_num = m_num;
params.n_pass = n_pass; params.n_pass = n_pass;

View File

@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
if (params.tune || !params.tune_file.empty()){ if (params.tune || !params.tune_file.empty()){
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) { if (!ok || (params.tune && !params.tune_file.empty())) {

View File

@ -394,7 +394,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
enum ggml_compute_error err = enum ggml_compute_error err =
shared->task_runner(&state->params, state->node); shared->task_runner(&state->params, state->node);
GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); GGML_ASSERT(err == GGML_COMPUTE_OK);
ggml_spin_lock(&shared->spin); ggml_spin_lock(&shared->spin);
@ -433,7 +433,11 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
// This is the params for main thread. // This is the params for main thread.
struct ggml_compute_params params; struct ggml_compute_params params;
enum ggml_compute_error err; enum ggml_compute_error err = GGML_COMPUTE_OK;
START:
memset(&params, 0, sizeof(struct ggml_compute_params));
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
@ -504,11 +508,19 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
} }
if (err != GGML_COMPUTE_OK) { if (err != GGML_COMPUTE_OK) {
if (err == GGML_COMPUTE_FALLBACK) {
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
int n = ggml_get_task_profiles(node, profiles);
GGML_ASSERT(n > 0);
memcpy(&node->task_profile, &profiles[0],
sizeof(struct ggml_task_profile));
goto START;
}
return err; return err;
} }
} }
return GGML_COMPUTE_OK; return err;
} }
struct ggml_threading_context * struct ggml_threading_context *

View File

@ -55,7 +55,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
struct ggml_task_profile *prof = NULL; const struct ggml_task_profile *prof = NULL;
if (e->M == M && e->N == N && e->K == K) { if (e->M == M && e->N == N && e->K == K) {
prof = e->profile; prof = e->profile;
@ -97,10 +97,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
e->N = N; e->N = N;
e->K = K; e->K = K;
// to disable this, build with #ifndef GGML_TUNE_NDEBUG
// `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1
// make`
#if !defined(GGML_MULMAT_TUNE_NDEBUG)
const char *names[3]; const char *names[3];
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
names[i] = ggml_mulmat_tune_task_backend_name( names[i] = ggml_mulmat_tune_task_backend_name(
@ -163,8 +160,8 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params, struct ggml_mulmat_tune_params *params,
struct ggml_task_profile_factory *pf) { ggml_task_profiles_provider *profiles_provider) {
GGML_ASSERT(profiles_provider);
struct ggml_mulmat_tune_model *model = &params->model; struct ggml_mulmat_tune_model *model = &params->model;
memset(tune, 0, sizeof(struct ggml_mulmat_tune)); memset(tune, 0, sizeof(struct ggml_mulmat_tune));
@ -208,8 +205,20 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
for (int i = 0; i < tune->n_shapes; i++) { for (int i = 0; i < tune->n_shapes; i++) {
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
shape->n_profiles = ggml_mulmat_get_task_profiles(
pf, shape->src0_type, shape->src1_type, &shape->profiles); struct ggml_tensor src0 = {
.type = shape->src0_type,
};
struct ggml_tensor src1 = {
.type = shape->src1_type,
};
struct ggml_tensor node = {
.op = GGML_OP_MUL_MAT,
.src0 = &src0,
.src1 = &src1,
};
shape->n_profiles = profiles_provider(&node, shape->profiles);
if (shape->n_profiles == 0) { if (shape->n_profiles == 0) {
// allowed for testing. // allowed for testing.
continue; continue;
@ -304,9 +313,20 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
for (int i = 0; i < tune->n_shapes; i++) { for (int i = 0; i < tune->n_shapes; i++) {
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
struct ggml_task_profile *builtin_profiles = NULL; struct ggml_tensor src0 = {
int n_profiles = ggml_mulmat_get_task_profiles( .type = shape->src0_type,
NULL, shape->src0_type, shape->src1_type, &builtin_profiles); };
struct ggml_tensor src1 = {
.type = shape->src1_type,
};
struct ggml_tensor node = {
.op = GGML_OP_MUL_MAT,
.src0 = &src0,
.src1 = &src1,
};
struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
if (n_profiles != shape->n_profiles) { if (n_profiles != shape->n_profiles) {
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
@ -382,13 +402,6 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
memset(shape->items, 0, item_size); memset(shape->items, 0, item_size);
} }
{
size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles;
shape->profiles = malloc(sz);
GGML_ASSERT(shape->profiles);
memset(shape->profiles, 0, sz);
}
for (int ip = 0; ip < shape->n_profiles; ip++) { for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip]; struct ggml_task_profile *profile = &shape->profiles[ip];
for (int j = 0; j < 3; j++) { for (int j = 0; j < 3; j++) {
@ -468,7 +481,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
} }
} }
struct ggml_task_profile *profile = &shape->profiles[ip]; const struct ggml_task_profile *profile = &shape->profiles[ip];
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
rc = fprintf(fp, "%9d", item->stages_time[k]); rc = fprintf(fp, "%9d", item->stages_time[k]);
@ -537,7 +550,7 @@ void ggml_mulmat_tune_estimate_time(
const int max_m = shape->items[m_num - 1].M; const int max_m = shape->items[m_num - 1].M;
for (int ip = 0; ip < shape->n_profiles; ip++) { for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip]; const struct ggml_task_profile *profile = &shape->profiles[ip];
profile_time[ip].total_time = 0; profile_time[ip].total_time = 0;
profile_time[ip].profile = profile; profile_time[ip].profile = profile;
@ -573,7 +586,7 @@ void ggml_mulmat_tune_estimate_time(
GGML_ASSERT(p0 && p1); GGML_ASSERT(p0 && p1);
for (int i_stage = 0; i_stage < 3; i_stage++) { for (int i_stage = 0; i_stage < 3; i_stage++) {
struct ggml_task_stage *stage = &profile->stages[i_stage]; const struct ggml_task_stage *stage = &profile->stages[i_stage];
if (stage->backend == GGML_TASK_BACKEND_NONE) { if (stage->backend == GGML_TASK_BACKEND_NONE) {
continue; continue;
} }
@ -736,7 +749,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
return false; return false;
} }
bool ok = ggml_mulmat_tune_init(tune, params, NULL); bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
if (!ok) { if (!ok) {
return false; return false;
} }

View File

@ -46,7 +46,7 @@ struct ggml_mulmat_tune_shape {
enum ggml_type src1_type; enum ggml_type src1_type;
int n_profiles; int n_profiles;
struct ggml_task_profile *profiles; struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
int m_num; int m_num;
int *arr_m; int *arr_m;
@ -69,7 +69,7 @@ struct ggml_mulmat_tune {
}; };
struct ggml_mulmat_tune_time { struct ggml_mulmat_tune_time {
struct ggml_task_profile *profile; const struct ggml_task_profile *profile;
int stage_time[3]; int stage_time[3];
int total_time; int total_time;
}; };
@ -78,7 +78,7 @@ struct mm_cache_element {
int M; int M;
int N; int N;
int K; int K;
struct ggml_task_profile *profile; const struct ggml_task_profile *profile;
int stages_time[3]; int stages_time[3];
}; };
@ -108,7 +108,7 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params, struct ggml_mulmat_tune_params *params,
struct ggml_task_profile_factory *profile_factory); ggml_task_profiles_provider *profiles_provider);
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);

535
ggml.c
View File

@ -144,7 +144,7 @@ inline static void* ggml_aligned_malloc(size_t size) {
#include "ggml-opencl.h" #include "ggml-opencl.h"
#endif #endif
#if defined(GGML_USE_MULMAT_TUNE) #if defined(GGML_USE_TUNE)
#include "ggml-tune.h" #include "ggml-tune.h"
#endif #endif
@ -4043,8 +4043,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_cl_init(); ggml_cl_init();
#endif #endif
ggml_mulmat_init_task_profiles();
is_first_call = false; is_first_call = false;
} }
@ -15524,20 +15522,89 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
return result; return result;
} }
// ---- mulmat task profiles ---- // ---- task profiles ----
static struct ggml_task_profile_factory default_task_profile_factory = {0}; // Implement `ggml_task_profiles_provider`.
// Fill `profiles` for the `node` and return number of profiles.
//
// NOTE: the node may be incompleted from testing or tunning, so please assert
// everything used here.
inline int ggml_get_task_profiles(
struct ggml_tensor *node,
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) {
GGML_ASSERT(node);
GGML_ASSERT(node->op >= 0);
GGML_ASSERT(profiles);
// TODO: thread unsafe. Should be initialized once. memset(profiles, 0,
void ggml_mulmat_init_task_profiles(void) { sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES);
const size_t sz = sizeof(struct ggml_task_profile_factory);
memset(&default_task_profile_factory, 0, sz); struct ggml_task_profile *p = profiles;
int n_profiles = 0;
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_ADD:
case GGML_OP_ADD1: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_ACC: {
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_SUB:
case GGML_OP_DIV:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_LOG:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK:
case GGML_OP_ABS:
case GGML_OP_SGN:
case GGML_OP_NEG:
case GGML_OP_STEP:
case GGML_OP_RELU: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_MUL: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_GELU:
case GGML_OP_SILU:
case GGML_OP_SILU_BACK:
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_MUL_MAT:
case GGML_OP_OUT_PROD: {
GGML_ASSERT(node->src0);
GGML_ASSERT(node->src1);
enum ggml_type src0_t = node->src0->type;
enum ggml_type src1_t = node->src1->type;
GGML_ASSERT(src1_t == GGML_TYPE_F32);
// f32
{
struct ggml_task_profile *p = default_task_profile_factory.f32_f32;
int i = 0; int i = 0;
if (src0_t == GGML_TYPE_F32) {
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].parallel = true; p[i].stages[1].parallel = true;
i++; i++;
@ -15557,14 +15624,7 @@ void ggml_mulmat_init_task_profiles(void) {
p[i].stages[1].wait = true; p[i].stages[1].wait = true;
i++; i++;
#endif #endif
default_task_profile_factory.n_f32_f32 = i; } else if (src0_t == GGML_TYPE_F16) {
}
// f16
{
struct ggml_task_profile *p = default_task_profile_factory.f16_f32;
int i = 0;
p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].parallel = true; p[i].stages[1].parallel = true;
@ -15585,14 +15645,7 @@ void ggml_mulmat_init_task_profiles(void) {
p[i].stages[1].wait = true; p[i].stages[1].wait = true;
i++; i++;
#endif #endif
default_task_profile_factory.n_f16_f32 = i; } else if (ggml_is_quantized(src0_t)) {
}
// qxx
{
struct ggml_task_profile *p = default_task_profile_factory.qxx_f32;
int i = 0;
p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[i].stages[1].parallel = true; p[i].stages[1].parallel = true;
@ -15615,73 +15668,108 @@ void ggml_mulmat_init_task_profiles(void) {
p[i].stages[1].wait = true; p[i].stages[1].wait = true;
i++; i++;
#endif #endif
default_task_profile_factory.n_qxx_f32 = i;
} }
n_profiles = i;
} break;
case GGML_OP_SCALE: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_SET: {
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_GET_ROWS:
case GGML_OP_GET_ROWS_BACK:
case GGML_OP_DIAG:
case GGML_OP_DIAG_MASK_ZERO: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_ALIBI: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_CLAMP: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
} break;
case GGML_OP_CONV_1D_1S:
case GGML_OP_CONV_1D_2S: {
p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_FLASH_ATTN: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} break;
case GGML_OP_FLASH_FF: {
p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
p[0].stages[1].parallel = true;
n_profiles = 1;
} }
case GGML_OP_FLASH_ATTN_BACK: {
int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf, p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
enum ggml_type src0_t, enum ggml_type src1_t, p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
struct ggml_task_profile **profiles) { p[0].stages[1].parallel = true;
GGML_ASSERT(profiles); n_profiles = 1;
} break;
if (pf == NULL) { case GGML_OP_MAP_UNARY:
pf = &default_task_profile_factory; case GGML_OP_MAP_BINARY: {
} p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
GGML_ASSERT(src1_t == GGML_TYPE_F32); } break;
case GGML_OP_CROSS_ENTROPY_LOSS:
if (src0_t == GGML_TYPE_F32) { p[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
*profiles = pf->f32_f32; p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
return pf->n_f32_f32; p[0].stages[1].parallel = true;
} p[0].stages[2].backend = GGML_TASK_BACKEND_CPU;
n_profiles = 1;
if (src0_t == GGML_TYPE_F16) { case GGML_OP_CROSS_ENTROPY_LOSS_BACK: {
*profiles = pf->f16_f32; p[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
return pf->n_f16_f32; p[0].stages[1].parallel = true;
} n_profiles = 1;
} break;
if (ggml_is_quantized(src0_t)) { case GGML_OP_NONE:
*profiles = pf->qxx_f32; case GGML_OP_COUNT: {
return pf->n_qxx_f32; GGML_ASSERT(false);
} } break;
default:
GGML_ASSERT(false); GGML_ASSERT(false);
} }
static const struct ggml_task_profile * GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES);
ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf, return n_profiles;
enum ggml_type src0_type,
enum ggml_type src1_type) {
GGML_ASSERT(src1_type == GGML_TYPE_F32);
if (pf == NULL) {
pf = &default_task_profile_factory;
}
struct ggml_task_profile *p = NULL;
if (src0_type == GGML_TYPE_F32) {
p = &pf->f32_f32[0];
} else if (src0_type == GGML_TYPE_F16) {
p = &pf->f16_f32[0];
} else if (ggml_is_quantized(src0_type)) {
p = &pf->qxx_f32[0];
} else {
GGML_ASSERT(false);
}
for (int i = 0; i < 3; i++) {
GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU ||
p->stages[i].backend == GGML_TASK_BACKEND_NONE);
}
return p;
} }
// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. // Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD.
static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, static const struct ggml_task_profile *ggml_mulmat_get_task_profile(
struct ggml_mulmat_tune *tune) { struct ggml_tensor *node, struct ggml_task_profile *profiles,
int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) {
GGML_ASSERT(node); GGML_ASSERT(node);
GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD);
GGML_ASSERT(profiles);
GGML_ASSERT(n_profiles >= 2);
enum ggml_type src0_t = node->src0->type; enum ggml_type src0_t = node->src0->type;
enum ggml_type src1_t = node->src1->type; enum ggml_type src1_t = node->src1->type;
@ -15697,42 +15785,26 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node,
int N = (int)node->ne[0]; int N = (int)node->ne[0];
int K = (int)node->src1->ne[0]; int K = (int)node->src1->ne[0];
struct ggml_task_profile *profiles = NULL;
int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles);
GGML_ASSERT(n_profiles >= 2);
GGML_ASSERT(profiles);
const struct ggml_task_profile *prof = NULL; const struct ggml_task_profile *prof = NULL;
if (cond_match) { if (cond_match) {
#if defined(GGML_USE_MULMAT_TUNE) #if defined(GGML_USE_TUNE)
if (tune != NULL) { if (tune != NULL) {
int stages_time_us[3]; prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t,
prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); src1_t, stages_time_us);
if (prof != NULL) { if (prof != NULL) {
GGML_ASSERT(prof); return prof;
memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile));
// Do not wait if the estimated execution time is too small (e.g. less than 0.1 ms)
// TODO: need bench actual wait/notify time, see ggml-threading.c
for (int i = 0; i < 3; i++) {
if (node->task_profile.stages[i].wait) {
if (stages_time_us[i] < 100) {
node->task_profile.stages[i].wait = false;
}
}
}
return;
} }
} }
#else #else
UNUSED(tune); UNUSED(tune);
UNUSED(stages_time_us);
#endif #endif
if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { if (prof == NULL && M >= 32 && N >= 32 && K >= 32) {
for (int j = 0; j < n_profiles; j++) { for (int j = 0; j < n_profiles; j++) {
enum ggml_task_backend comp_be = enum ggml_task_backend comp_be =
profiles[j].stages[GGML_TASK_COMPUTE].backend; profiles[j].stages[GGML_TASK_COMPUTE].backend;
switch (comp_be) { switch (comp_be) {
case GGML_TASK_BACKEND_GPU_CUDA: { case GGML_TASK_BACKEND_GPU_CUDA: {
GGML_ASSERT(ggml_cpu_has_cublas()); GGML_ASSERT(ggml_cpu_has_cublas());
@ -15753,76 +15825,131 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node,
break; break;
} }
} }
if (prof) {
break;
}
} }
} }
} }
if (prof == NULL) { if (prof == NULL) {
prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t); prof = &profiles[0];
GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU);
} }
GGML_ASSERT(prof); return prof;
memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile));
} }
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
int n_threads = cgraph->n_threads; int n_threads = cgraph->n_threads;
if (ggml_cpu_has_blas()) {
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor *node = cgraph->nodes[i];
memset(&node->task_profile, 0, sizeof(struct ggml_task_profile));
struct ggml_task_stage *stages = node->task_profile.stages;
// Adapt node->backend: assume GPU at COMPUTE stage.
if (node->backend > GGML_BACKEND_CPU) {
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE;
stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE;
stages[GGML_TASK_COMPUTE].parallel = false;
bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL);
stages[GGML_TASK_COMPUTE].wait = wait;
if (ggml_cpu_has_cublas()) {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA;
} else if (ggml_cpu_has_clblast()) {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL;
} else {
GGML_ASSERT(false);
}
} else if (node->op == GGML_OP_MUL_MAT) {
struct ggml_mulmat_tune * tune = NULL;
#if defined(GGML_USE_MULMAT_TUNE)
tune = cgraph->tune;
#endif
ggml_mulmat_set_tensor_task_profile(node, tune);
} else if (node->op == GGML_OP_OUT_PROD) {
ggml_mulmat_set_tensor_task_profile(node, NULL);
}
}
}
struct ggml_threading_context *thrd_ctx = ggml_threading_start( struct ggml_threading_context *thrd_ctx = ggml_threading_start(
n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward,
GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL);
// initialize tasks + work buffer // initialize tasks + work buffer
{ {
// int64_t t0 = ggml_time_us();
size_t work_size = 0; size_t work_size = 0;
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
// thread scheduling for the different operations // thread scheduling for the different operations
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i]; struct ggml_tensor * node = cgraph->nodes[i];
if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) {
continue;
}
int n_profiles = ggml_get_task_profiles(node, profiles);
const struct ggml_task_profile *profile = NULL;
// Adapt node->backend: assume GPU at COMPUTE stage.
if (node->backend == GGML_BACKEND_GPU ||
node->backend == GGML_BACKEND_GPU_SPLIT) {
enum ggml_task_backend be;
if (ggml_cpu_has_cublas()) {
be = GGML_TASK_BACKEND_GPU_CUDA;
} else if (ggml_cpu_has_clblast()) {
be = GGML_TASK_BACKEND_GPU_CL;
} else {
GGML_ASSERT(false);
}
for (int j = 0; j < n_profiles; j++) {
if (profiles[j].stages[1].backend == be) {
profile = &profiles[j];
break;
}
}
GGML_ASSERT(profile);
} else {
GGML_ASSERT(node->backend == GGML_BACKEND_CPU);
}
bool profile_copied = false;
if (node->op == GGML_OP_MUL_MAT) {
#if defined(GGML_USE_TUNE)
int stages_time_us[3];
profile = ggml_mulmat_get_task_profile(
node, profiles, n_profiles, cgraph->tune, stages_time_us);
GGML_ASSERT(profile);
if (cgraph->tune) {
memcpy(&node->task_profile, profile,
sizeof(struct ggml_task_profile));
profile_copied = true;
// Do not wait if the estimated execution time is too small
// (e.g. less than 0.1 ms)
// TODO: need bench actual wait/notify time, see
// ggml-threading.c
for (int j = 0; j< 3; j++) {
if (node->task_profile.stages[j].wait) {
if (stages_time_us[j] < 100) {
node->task_profile.stages[j].wait = false;
}
}
}
}
#else
profile = ggml_mulmat_get_task_profile(node, profiles,
n_profiles, NULL, NULL);
GGML_ASSERT(profile);
#endif
} else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right?
profile = ggml_mulmat_get_task_profile(node, profiles,
n_profiles, NULL, NULL);
GGML_ASSERT(profile);
} else {
profile = &profiles[0];
GGML_ASSERT(profile->stages[1].backend ==
GGML_TASK_BACKEND_CPU);
}
if (!profile_copied) {
memcpy(&node->task_profile, profile,
sizeof(struct ggml_task_profile));
}
struct ggml_task_stage *stages = node->task_profile.stages; struct ggml_task_stage *stages = node->task_profile.stages;
// compute stage n_tasks.
int n_tasks = stages[1].parallel ? n_threads : 1;
// Allocate temp buffer `wdata` for CPU.
// NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases.
switch (node->op) { switch (node->op) {
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_DUP: case GGML_OP_DUP:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
size_t cur = 0; size_t cur = 0;
if (ggml_is_quantized(node->type)) { if (ggml_is_quantized(node->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
@ -15830,27 +15957,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_ADD1: case GGML_OP_ADD1:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0; size_t cur = 0;
if (ggml_is_quantized(node->src0->type)) { if (ggml_is_quantized(node->src0->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks;
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
} break; } break;
case GGML_OP_ACC: case GGML_OP_ACC:
{ {
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0; size_t cur = 0;
if (ggml_is_quantized(node->src0->type)) { if (ggml_is_quantized(node->src0->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks;
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
@ -15870,16 +15990,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_NEG: case GGML_OP_NEG:
case GGML_OP_STEP: case GGML_OP_STEP:
case GGML_OP_RELU: case GGML_OP_RELU:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_MUL: case GGML_OP_MUL:
{
if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
}
} break;
case GGML_OP_GELU: case GGML_OP_GELU:
case GGML_OP_SILU: case GGML_OP_SILU:
case GGML_OP_SILU_BACK: case GGML_OP_SILU_BACK:
@ -15887,28 +15998,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK: case GGML_OP_RMS_NORM_BACK:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
} break; } break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_OUT_PROD: case GGML_OP_OUT_PROD: // FIXME: is is right?
{ {
size_t cur = 0; size_t cur = 0;
enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend;
GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE);
// TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct.
if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) {
enum ggml_type src0_t = node->src0->type;
enum ggml_type src1_t = node->src1->type;
bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 ||
ggml_is_quantized(src0_t)) &&
src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 &&
ggml_is_contiguous(node->src0) &&
ggml_is_contiguous(node->src1);
GGML_ASSERT(cond_match);
}
if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
#if defined(GGML_USE_CLBLAST) #if defined(GGML_USE_CLBLAST)
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
@ -15930,7 +16027,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
// We have to reseve buffer for CUDA because it may fallback to CPU.
if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
GGML_ASSERT(ggml_cpu_has_cublas()); GGML_ASSERT(ggml_cpu_has_cublas());
} }
@ -15955,13 +16051,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break; } break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
} break; } break;
case GGML_OP_SET: case GGML_OP_SET:
{ {
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break; } break;
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
@ -15972,33 +16064,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_GET_ROWS_BACK: case GGML_OP_GET_ROWS_BACK:
case GGML_OP_DIAG: case GGML_OP_DIAG:
case GGML_OP_DIAG_MASK_ZERO: case GGML_OP_DIAG_MASK_ZERO:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK: case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK: case GGML_OP_ROPE_BACK:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
} break;
case GGML_OP_ALIBI: case GGML_OP_ALIBI:
{
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break; } break;
case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_1S:
case GGML_OP_CONV_1D_2S: case GGML_OP_CONV_1D_2S:
{ {
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src0->ne[3] == 1);
GGML_ASSERT(node->src1->ne[2] == 1); GGML_ASSERT(node->src1->ne[2] == 1);
GGML_ASSERT(node->src1->ne[3] == 1); GGML_ASSERT(node->src1->ne[3] == 1);
@ -16026,62 +16103,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break; } break;
case GGML_OP_FLASH_ATTN: case GGML_OP_FLASH_ATTN:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0; size_t cur = 0;
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
if (node->src1->type == GGML_TYPE_F32) { if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
} }
if (node->src1->type == GGML_TYPE_F16) { if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
} break; } break;
case GGML_OP_FLASH_FF: case GGML_OP_FLASH_FF:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0; size_t cur = 0;
if (node->src1->type == GGML_TYPE_F32) { if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2
} }
if (node->src1->type == GGML_TYPE_F16) { if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
} break; } break;
case GGML_OP_FLASH_ATTN_BACK: case GGML_OP_FLASH_ATTN_BACK:
{ {
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = 0; size_t cur = 0;
const int64_t D = node->src0->ne[0]; const int64_t D = node->src0->ne[0];
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
if (node->src1->type == GGML_TYPE_F32) { if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
} }
if (node->src1->type == GGML_TYPE_F16) { if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
@ -16089,31 +16157,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
case GGML_OP_MAP_UNARY: case GGML_OP_MAP_UNARY:
case GGML_OP_MAP_BINARY: case GGML_OP_MAP_BINARY:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break; } break;
case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS:
{ {
stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_tasks);
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
stages[GGML_TASK_COMPUTE].parallel = true;
stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU;
size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads);
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
} break; } break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks;
stages[GGML_TASK_COMPUTE].parallel = true;
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads;
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
} break; } break;
case GGML_OP_NONE: case GGML_OP_NONE:
{ {
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU;
} break; } break;
case GGML_OP_COUNT: case GGML_OP_COUNT:
{ {
@ -16134,6 +16192,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
} }
// ~ 50 us
//printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0));
} }
const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_cycles = ggml_perf_cycles();
@ -16162,16 +16223,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
enum ggml_compute_error err = enum ggml_compute_error err =
ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
if (err == GGML_COMPUTE_FALLBACK) {
if (node->op == GGML_OP_MUL_MAT) {
const struct ggml_task_profile *p =
ggml_mulmat_get_default_task_profile(
NULL, node->src0->type, node->src1->type);
memcpy(&node->task_profile, p,
sizeof(struct ggml_task_profile));
}
err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
}
GGML_ASSERT(err == GGML_COMPUTE_OK); GGML_ASSERT(err == GGML_COMPUTE_OK);
// performance stats (node) // performance stats (node)

28
ggml.h
View File

@ -202,7 +202,7 @@
#define GGML_MAX_OPT 4 #define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32 #define GGML_MAX_NAME 32
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
#define GGML_MAX_TASK_PROFILES 8 #define GGML_MAX_TASK_PROFILES 4
#define GGML_ASSERT(x) \ #define GGML_ASSERT(x) \
do { \ do { \
@ -399,17 +399,6 @@ extern "C" {
uint8_t dev_flags[4]; uint8_t dev_flags[4];
}; };
struct ggml_task_profile_factory {
struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES];
int n_f32_f32;
struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES];
int n_f16_f32;
struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES];
int n_qxx_f32;
};
// n-dimensional tensor // n-dimensional tensor
struct ggml_tensor { struct ggml_tensor {
enum ggml_type type; enum ggml_type type;
@ -450,6 +439,11 @@ extern "C" {
char padding[12]; char padding[12];
}; };
// Fill `profiles` for the `node` and return number of profiles.
typedef int (ggml_task_profiles_provider) (
struct ggml_tensor *node,
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]);
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// computation graph // computation graph
@ -1345,15 +1339,11 @@ extern "C" {
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
// //
// mulmat task profiles // task profiles
// //
GGML_API void ggml_mulmat_init_task_profiles(void);
GGML_API int ggml_mulmat_get_task_profiles( // Implements `ggml_task_profiles_provider`.
struct ggml_task_profile_factory *pf, GGML_API int ggml_get_task_profiles (struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]);
enum ggml_type src0_t,
enum ggml_type src1_t,
struct ggml_task_profile **profiles);
// //
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks

View File

@ -21,7 +21,7 @@
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
#include "ggml-tune.h" #include "ggml-tune.h"
#endif #endif
@ -285,7 +285,7 @@ struct llama_context {
int buf_last = 0; int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
struct ggml_mulmat_tune *tune = nullptr; struct ggml_mulmat_tune *tune = nullptr;
#endif #endif
@ -1408,7 +1408,7 @@ static bool llama_eval_internal(
ggml_cgraph gf = {}; ggml_cgraph gf = {};
gf.n_threads = n_threads; gf.n_threads = n_threads;
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
gf.tune =lctx.tune; gf.tune =lctx.tune;
#endif #endif
@ -2743,7 +2743,7 @@ struct llama_context * llama_init_from_file(
return ctx; return ctx;
} }
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
printf("\n"); printf("\n");
if (ctx->model.n_gpu_layers != 0) { if (ctx->model.n_gpu_layers != 0) {
@ -2882,7 +2882,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons
#endif #endif
void llama_free(struct llama_context * ctx) { void llama_free(struct llama_context * ctx) {
#ifdef GGML_USE_MULMAT_TUNE #ifdef GGML_USE_TUNE
if (ctx->tune) { if (ctx->tune) {
delete(ctx->tune); delete(ctx->tune);
} }

View File

@ -356,6 +356,17 @@ int main(void) {
{ {
++n_tests; ++n_tests;
// required by getting task profiles.
node.op = GGML_OP_MUL_MAT;
struct ggml_tensor src0 = {
.type = GGML_TYPE_Q4_0,
};
struct ggml_tensor src1 = {
.type = GGML_TYPE_F32,
};
node.src0 = &src0;
node.src1 = &src1;
node.backend = GGML_BACKEND_GPU; node.backend = GGML_BACKEND_GPU;
if (test_fallback(&node) == 0) { if (test_fallback(&node) == 0) {
++n_passed; ++n_passed;

View File

@ -3,6 +3,8 @@
#include <string.h> #include <string.h>
#define UNUSED(x) (void)(x)
static int bench(void); static int bench(void);
static int estimate_time_non_zero_NK(void); static int estimate_time_non_zero_NK(void);
@ -77,6 +79,20 @@ static int bench(void) {
return ok ? 0 : 1; return ok ? 0 : 1;
} }
// implement `ggml_task_profiles_provider`
static int
ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
struct ggml_task_profile *profiles) {
UNUSED(node);
profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
return 2;
}
int estimate_time_non_zero_NK(void) { int estimate_time_non_zero_NK(void) {
printf("test: %s\n", __func__); printf("test: %s\n", __func__);
@ -92,22 +108,10 @@ int estimate_time_non_zero_NK(void) {
const int m_num = 2; const int m_num = 2;
struct ggml_task_profile_factory pf;
memset(&pf, 0, sizeof(struct ggml_task_profile_factory));
{
pf.n_qxx_f32 = 2;
pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
}
struct ggml_mulmat_tune_params params; struct ggml_mulmat_tune_params params;
init_params(&params, m_num); init_params(&params, m_num);
ggml_mulmat_tune_init(&tune, &params, &pf); ggml_mulmat_tune_init(&tune, &params, ggml_task_profiles_mock_qxx_provider);
struct ggml_mulmat_tune_shape *shape = NULL; struct ggml_mulmat_tune_shape *shape = NULL;
for (int i = 0; i < tune.n_shapes; i++) { for (int i = 0; i < tune.n_shapes; i++) {