From 213f133701e947a94bdd070315cc0528429bd27d Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 14 Jun 2023 18:33:14 +0800 Subject: [PATCH] initial --- .gitignore | 1 + CMakeLists.txt | 9 + Makefile | 31 +- examples/CMakeLists.txt | 1 + examples/common.cpp | 14 + examples/common.h | 2 + examples/main/main.cpp | 10 + examples/mulmat-tune/CMakeLists.txt | 14 + examples/mulmat-tune/README.md | 272 +++++++ examples/mulmat-tune/mulmat-tune.cpp | 277 +++++++ examples/perplexity/perplexity.cpp | 10 + ggml-cuda.cu | 2 +- ggml-opencl.cpp | 2 +- ggml-threading.c | 620 ++++++++++++++++ ggml-threading.h | 68 ++ ggml-tune.c | 897 ++++++++++++++++++++++ ggml-tune.h | 137 ++++ ggml.c | 1032 +++++++++++++------------- ggml.h | 71 +- llama.cpp | 160 +++- llama.h | 3 + tests/.gitignore | 2 + tests/CMakeLists.txt | 2 + tests/test-ggml-threading.c | 345 +++++++++ tests/test-ggml-tune.c | 200 +++++ 25 files changed, 3664 insertions(+), 518 deletions(-) create mode 100644 examples/mulmat-tune/CMakeLists.txt create mode 100644 examples/mulmat-tune/README.md create mode 100644 examples/mulmat-tune/mulmat-tune.cpp create mode 100644 ggml-threading.c create mode 100644 ggml-threading.h create mode 100644 ggml-tune.c create mode 100644 ggml-tune.h create mode 100644 tests/.gitignore create mode 100644 tests/test-ggml-threading.c create mode 100644 tests/test-ggml-tune.c diff --git a/.gitignore b/.gitignore index e7bfd52e3..c2e2a0ab0 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ models/* /server /Pipfile /libllama.so +/mulmat-tune build-info.h arm_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 736771954..832c1e986 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,6 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" OFF) +option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF) # # Build info header @@ -214,6 +215,7 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) + add_compile_definitions(GGML_BLAS_VENDOR="${LLAMA_BLAS_VENDOR}") set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) @@ -276,6 +278,11 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_MULMAT_TUNE) + add_compile_definitions(GGML_USE_MULMAT_TUNE) + add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG) +endif() + if (LLAMA_K_QUANTS) set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) add_compile_definitions(GGML_USE_K_QUANTS) @@ -450,6 +457,8 @@ endif() add_library(ggml OBJECT ggml.c + ggml-threading.c + ggml-tune.c ggml.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} diff --git a/Makefile b/Makefile index afd06e0a6..a8d1bdc09 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple mulmat-tune ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -47,7 +47,8 @@ endif OPT = -O3 CFLAGS = -I. $(OPT) -std=c11 -fPIC CXXFLAGS = -I. 
-I./examples $(OPT) -std=c++11 -fPIC -LDFLAGS = +# -lm fixed error: ggml.o: undefined reference to symbol 'tanhf@@GLIBC_2.2.5' from ubuntu 22.04 +LDFLAGS = -lm ifdef LLAMA_DEBUG CFLAGS += -O0 -g @@ -134,8 +135,7 @@ ifndef LLAMA_NO_K_QUANTS endif ifndef LLAMA_NO_ACCELERATE - # Mac M1 - include Accelerate framework. - # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). + # Mac Intel & M1 - include Accelerate framework. ifeq ($(UNAME_S),Darwin) CFLAGS += -DGGML_USE_ACCELERATE LDFLAGS += -framework Accelerate @@ -145,10 +145,16 @@ endif # LLAMA_NO_ACCELERATE ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas LDFLAGS += -lopenblas + ifeq ($(UNAME_S),Darwin) + # openblas installed with Homebew on macOS. + CFLAGS += -I/usr/local/opt/openblas/include + LDFLAGS += -L/usr/local/opt/openblas/lib + endif endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + CFLAGS += -DGGML_BLAS_VENDOR="\"BLIS\"" LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS @@ -230,6 +236,11 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS +ifdef LLAMA_MULMAT_TUNE + CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG + CXXFLAGS += -DGGML_USE_MULMAT_TUNE +endif + # # Print build information # @@ -245,6 +256,8 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) +OBJS += ggml-tune.o ggml-threading.o + # # Build library # @@ -253,7 +266,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h - $(CXX) $(CXXFLAGS) -c $< -o $@ + +ggml-threading.o: ggml-threading.c ggml.h + $(CC) $(CFLAGS) -c $< -o $@ + +ggml-tune.o: ggml-tune.c ggml.h + $(CC) $(CFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -298,6 +316,9 @@ server: examples/server/server.cpp examples/server/httplib.h examples/server/jso train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +mulmat-tune: examples/mulmat-tune/mulmat-tune.cpp build-info.h ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o mulmat-tune $(LDFLAGS) + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index cf9c4a223..cf01b8a2a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,6 +39,7 @@ else() add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) add_subdirectory(simple) + add_subdirectory(mulmat-tune) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/common.cpp b/examples/common.cpp index fed24e027..882e90c9c 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -345,6 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; +#ifdef GGML_USE_MULMAT_TUNE + } else if (arg == "--tune") { + params.tune = true; + } else if (arg == "--tune-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.tune_file = argv[i]; +#endif // GGML_USE_MULMAT_TUNE } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -498,6 +508,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); +#ifdef GGML_USE_MULMAT_TUNE + fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n"); + fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n"); +#endif fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); diff --git a/examples/common.h b/examples/common.h index 6c2953cb2..5e394b218 100644 --- a/examples/common.h +++ b/examples/common.h @@ -77,6 +77,8 @@ struct gpt_params { bool mem_test = false; // compute maximum memory usage bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation + bool tune = false; // mulmat tune: enable + std::string tune_file = ""; // mulmat tune: data file }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 941312f9c..542e463bf 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,6 +117,16 @@ int main(int argc, char ** argv) { return 1; } +#ifdef GGML_USE_MULMAT_TUNE + if (params.tune || !params.tune_file.empty()) { + bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); + if (!ok || (params.tune && !params.tune_file.empty())) { + llama_free(ctx); + return ok? 
0: 1;
+        }
+    }
+#endif
+
     // print system information
     {
         fprintf(stderr, "\n");
diff --git a/examples/mulmat-tune/CMakeLists.txt b/examples/mulmat-tune/CMakeLists.txt
new file mode 100644
index 000000000..51e1053e8
--- /dev/null
+++ b/examples/mulmat-tune/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(TARGET mulmat-tune)
+add_executable(${TARGET} mulmat-tune.cpp)
+
+if (XCODE OR MSVC)
+    set(MULMAT_TUNE_LIBS ggml)
+else()
+    set(MULMAT_TUNE_LIBS ggml m)
+endif()
+
+target_link_libraries(${TARGET} PRIVATE ${MULMAT_TUNE_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md
new file mode 100644
index 000000000..cff8a3d64
--- /dev/null
+++ b/examples/mulmat-tune/README.md
@@ -0,0 +1,272 @@
+# Mulmat Benchmark and Tuning
+
+Apart from the standalone tool `mulmat-tune`, mulmat tune is also integrated into
+`main` and `perplexity`. To avoid adding too many new cli options, I added just two.
+To make the integrated bench run faster, `m_num` is set to 8 (thus the max M is 128)
+and `n_pass` is set to 1.
+
+With the newly added cli options, `main` and `perplexity` can be used in the
+following three ways:
+
+* bench and run: --tune
+* bench and exit: --tune --tune-file
+* load and run: --tune-file
+
+The `load` mode reads an existing data file. This is fine because we can run the
+bench ahead of time (saving tens of seconds), but it has two shortcomings:
+- the bench has to be re-run whenever the file format changes; this is OK because
+  the loader detects the mismatch and tells us to re-run.
+- the more subtle problem is that the algorithm may change silently while we keep
+  using an outdated data file. So I integrated mulmat tune into `main` and
+  `perplexity` as a complementary solution.
+
+## Build into main and perplexity
+
+Makefile:
+```
+make clean && LLAMA_MULMAT_TUNE=1 make
+```
+
+CMake (with BLAS):
+```
+cmake --build . --target clean
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake --build . --config Release
+```
+
+Run examples:
+
+```
+# bench and run:
+
+./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune
+
+# bench then exit:
+./main -m ./models/3B/open-llama-3b-q4-0.bin --tune --tune-file
+
+# load and run
+
+./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file
+```
+
+# Build the standalone `mulmat-tune`
+
+Makefile:
+```
+make clean && LLAMA_MULMAT_TUNE=1 make
+```
+
+CMake (with BLAS):
+```
+cmake --build . --target clean
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake --build . --config Release
+```
+
+Run examples:
+
+```
+./mulmat-tune -h
+
+# run with default params (7B, Q4_0, ...)
+./mulmat-tune
+
+# set model
+./mulmat-tune --model 13B
+
+# set ggml ftype, 2 for Q4_0, 3 for Q4_1, run `mulmat-tune -h` for help.
+./mulmat-tune --ftype 3
+
+# customized m_num
+./mulmat-tune --m_num 8
+
+# customized n_pass: run 1 pass only instead of the default 3.
+./mulmat-tune --n_pass 1
+
+# customized n_threads instead of the default 1.
+./mulmat-tune --n_threads 4
+
+# save to file
+./mulmat-tune --file
+
+# save to file, always override if exists (CAUTION!)
+./mulmat-tune --file -y
+
+```
+
+# End to End Test
+
+## Compare With Master
+
+You may want to run the following commands. Make sure the tune result file is
+set up properly.
+
+General steps:
+
+1. 
run `./mulmat-tune -h` to see how to build for misc vendors. + you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g: + ``` + make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make + ``` + On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along + with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU` + or `CPU_BLAS` is more faster (as of the estimation from mulmat tune). +2. create a small prompt file: + ``` + head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw + ``` +3. run any of the following example commands. + ``` + ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32 + ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64 + ``` + * `--mlock` is recommended for `macOS`, you may not want to use it. + * don't change `-c 128`: too large `context size` causes 0 perplexity trunk. + * `-t` is the number of threads, recommend `1`, `2`, `4` or `6`. + * you can change the batch size (`-b`) between `1` and `128`. + * you may want to add other cli options. + +The following results are generated with Accelerate compiled. + +### 1 thread + +**Master (2d43387d)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 43.53 | 339.95 | +| 16 | 44.31 | 346.12 | +| 24 | 43.14 | 336.90 | +| 32 | 33.59 | 262.25 | +| 40 | 27.64 | 215.77 | +| 48 | 24.52 | 191.42 | +``` + +**This branch (tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 43.78 | 341.96 | +| 16 | 42.88 | 334.93 | +| 24 | 42.06 | 328.42 | +| 32 | 33.07 | 258.25 | +| 40 | 28.69 | 223.98 | +| 48 | 25.65 | 200.19 | +``` + +### 4 threads + +**Master (2d43387d)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.43 | 96.99 | +| 16 | 12.10 | 94.44 | +| 24 | 12.81 | 99.95 | +| 32 | 31.64 | 247.04 | +| 48 | 24.55 | 191.63 | +| 64 | 17.56 | 137.09 | +| 96 | 17.59 | 137.25 | +| 128 | 10.73 | 83.74 | +``` + +**This branch (no tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.31 | 96.07 | +| 16 | 12.00 | 93.63 | +| 24 | 12.07 | 94.15 | +| 32 | 20.34 | 158.76 | +| 48 | 15.86 | 123.73 | +| 64 | 10.98 | 85.69 | +| 96 | 11.24 | 87.66 | +| 128 | 7.53 | 58.77 | +``` + +**This branch (tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.48 | 97.37 | +| 16 | 12.26 | 95.70 | +| 24 | 12.25 | 95.53 | +| 32 | 11.98 | 93.58 | +| 48 | 12.57 | 98.12 | +| 64 | 11.28 | 88.05 | +| 96 | 9.55 | 74.53 | +| 128 | 7.51 | 58.61 | +``` + +# Bench Data Format + +**Example** + +``` +5 3B 2 6 1 + +3200 3200 2 0 3 10 +16 0 0 0 16 1 0 1 0 0 0 0 +16 1 0 2 17 0 1 0 0 0 0 0 + 0 0 0 0 34 0 1 0 0 0 0 0 + 1 1 793 0 9103 2102 0 0 6014 0 + 2 2 1591 0 8034 2305 0 0 30982 0 + 4 4 2236 0 6476 2484 0 0 31388 0 + 8 7 4161 0 6623 2389 0 0 29204 0 + 16 15 8339 0 6434 2752 0 0 34303 0 + 32 32 16919 0 6915 3651 0 0 42511 0 + 64 200 34270 0 6574 4528 0 0 68212 0 + 128 188 69400 0 6325 6839 0 0 74437 0 + 256 303 134597 0 6168 11544 0 0 110180 0 + 512 687 279685 0 6337 29712 0 0 159728 0 + +3200 8640 2 0 2 10 + + ... 
+ + ``` + +**Informal Explanation** + +``` +head +groups+ + +head := version model ggml_ftype n_shapes n_threads +shape+ + +# head +version: 1 +model: "3B" | "7B" | "13B" | "30B" | "65B" +ggml_ftype: 0 - 4, 7 - 14 +n_shapes: number of shapes +n_threads: number of threads + +shape := N K m_num n_profiles +task_conf_profile+ +bench_item+ + +task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) +stage_conf: backend parallel wait +backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) +parallel: 0 (false) | 1 (true) +wait: 0 (false) | 1 (true) + +bench_item: M profile_time+ +profile_time := stage_time[3] +stage_time[3]: init_time, compute_time, finalize_time +``` + +A task stage is invalid if it's backend equals to `GGML_TASK_BACKEND_NONE`. +Time unit is `us`. A column is all zeros when that stage does not exist. + +# NOTE + +1. "3B" is [open-llama 3B](https://github.com/ggerganov/llama.cpp/pull/1588). +2. Model names are subject to change: we may support something like X-3B, Y-4B, ... +3. As of Jun 1, this tool is still in early stage, will be changed frequently in + recent couple of days (or weeks). diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp new file mode 100644 index 000000000..62f1da277 --- /dev/null +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -0,0 +1,277 @@ +#include +#include +#include +#include +#include + +#include "build-info.h" +#include "ggml-tune.h" +#include "ggml.h" +#include "llama.h" + +#define UNUSED(x) (void)(x) + +static void print_build_tips(void) { + const char *a = "LLAMA_NO_ACCELERATE"; + fprintf(stderr, "Tips on how to build with various backend vendors:\n\n"); + fprintf(stderr, "CUDA: make clean; LLAMA_CUBLAS=1 make\n"); + fprintf(stderr, "CL: make clean; LLAMA_CLBLAST=1 make\n"); + fprintf(stderr, "Accelerate: make clean; %s= make\n", a); + fprintf(stderr, "OpenBLAS: make clean; %s=1 LLAMA_OPENBLAS=1 make\n", a); + fprintf(stderr, "BLIS: make clean; %s=1 LLAMA_BLIS=1 make\n", a); + fprintf(stderr, "\n"); + fprintf(stderr, "NOTE: for CUDA/CL, use %s=1 to disable ACCELERATE\n", a); +} + +static bool prompt_yes_no(const char *prompt) { + char buf[2]; + while (true) { + fprintf(stderr, "%s (Y|n)\n", prompt); + buf[0] = 0; + buf[1] = 0; + int i = 0; + int c = 0; + + while (c != '\n') { + c = fgetc(stdin); + buf[i % 2] = c; + i++; + } + if (i == 1) { + if (buf[0] == '\n') { + return true; + } + } else if (i == 2) { + if (buf[0] == 'Y' || buf[0] == 'y') { + return true; + } + if (buf[0] == 'N' || buf[0] == 'n') { + return false; + } + } + } +} + +static void cmd_analyze(struct ggml_mulmat_tune *tune); + +static void usage(char *prog) { + const char *usage_lines[] = { + "usage: %s args", + "", + "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]", + "--model MODEL 3B | 7B | 13B | 30B | 65B", + " default 7B", + "--ftype FTYPE ggml ftype:", + " 0: all F32", + " 1: mostly F16", + " 2: mostly Q4_0", + " 3: mostly Q4_1", + " 4: mostly Q4_1, some F16", + " 7: mostly Q8_0", + " 8: mostly Q5_0", + " 9: mostly Q5_1", + " 10: mostly Q2_K", + " 11: mostly Q3_K", + " 12: mostly Q4_K", + " 13: mostly Q5_K", + " 14: mostly Q6_K", + " default 2 (mostly Q4_0)", + "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)", + " requires between [6, 12]", + " default 10", + "--n_pass PASS number of passes to run", + " default 1", + " requires: between [1, 3]", + "--n_threads NTH bench with this number of threads", + " requires: between [1, 16]", + " default 1", + "--file FILE data file to write", + " default 
stdout", + "-y always answer \"yes\" to all prompts", + }; + + int len = (int)(sizeof(usage_lines) / sizeof(char *)); + for (int i = 0; i < len; i++) { + const char *line = usage_lines[i]; + if (i == 0) { + fprintf(stderr, line, prog); + } else { + fprintf(stderr, "%s\n", line); + } + } + + printf("\n"); + print_build_tips(); + printf("\n"); +} + +int main(int argc, char **argv) { + if (argc == 2) { + if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { + usage(argv[0]); + return 0; + } + } + + int arg_start = 1; + + const char *arg_model = NULL; + const char *arg_ftype = NULL; + const char *arg_m_num = NULL; + const char *arg_n_threads = NULL; + const char *arg_n_pass = NULL; + const char *arg_file = NULL; + bool always_yes = false; + + for (int i = arg_start; i < argc; i++) { + if (strcmp(argv[i], "--model") == 0) { + if (i + 1 < argc) { + arg_model = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--ftype") == 0) { + if (i + 1 < argc) { + arg_ftype = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--m_num") == 0) { + if (i + 1 < argc) { + arg_m_num = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--n_pass") == 0) { + if (i + 1 < argc) { + arg_n_pass = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--n_threads") == 0) { + if (i + 1 < argc) { + arg_n_threads = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--file") == 0) { + if (i + 1 < argc) { + arg_file = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "-y") == 0) { + always_yes = true; + } else { + fprintf(stderr, "invalid arg: %s\n", argv[i]); + usage(argv[0]); + return 1; + } + } + + enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0; + { + if (arg_ftype != NULL) { + int v = atoi(arg_ftype); + ftype = (enum ggml_ftype)v; + } + + if (ftype > GGML_FTYPE_MOSTLY_Q5_1) { + fprintf(stderr, "k_quants type %d is not implemented\n", ftype); + return 1; + } + } + + if (arg_file != NULL && !always_yes) { + struct stat st; + int rc = stat(arg_file, &st); + UNUSED(st); + if (rc == 0) { // prompt + size_t len = strlen(arg_file) + 50; + char *prompt = (char *)malloc(len); + GGML_ASSERT(prompt); + snprintf(prompt, len, "data file '%s' exists, override?", arg_file); + + if (!prompt_yes_no(prompt)) { + printf("Aborted.\n"); + return 1; + } + free(prompt); + } + } + + int m_num = 10; + { + if (arg_m_num != NULL) { + int v = atoi(arg_m_num); + m_num = v; + } + + if (m_num < 6 || m_num > 12) { + fprintf(stderr, "invalid m_num: %d, expect between [6, 12]\n", + m_num); + usage(argv[0]); + return 1; + } + } + + int n_pass = 1; + { + if (arg_n_pass != NULL) { + int v = atoi(arg_n_pass); + n_pass = v; + } + if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) { + fprintf(stderr, "invalid n_pass: %d, expect between [1, %d]\n", + n_pass, GGML_MULMAT_MAX_PASS); + usage(argv[0]); + return 1; + } + } + + int n_threads = 1; + { + if (arg_n_threads != NULL) { + int v = atoi(arg_n_threads); + n_threads = v; + if (n_threads < 1 || n_threads > 16) { + fprintf(stderr, + "invalid n_threads: %d, expect between [1, 16]\n", + n_threads); + usage(argv[0]); + return 1; + } + } + } + + const char *model_name = "7B"; + { + if (arg_model != NULL) { + model_name = arg_model; + } + } + + // Let init message print earlier. 
+ { + struct ggml_init_params init_params = { + /*.mem_size =*/1, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/0, + }; + struct ggml_context *ctx = ggml_init(init_params); + GGML_ASSERT(ctx); + ggml_free(ctx); + } + + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params)); + + ggml_mulmat_init_task_profiles(); + + ggml_mulmat_tune_model_init(¶ms.model, model_name, ftype); + params.m_num = m_num; + params.n_pass = n_pass; + params.n_threads = n_threads; + params.progress = true; + params.output_console = true; + params.fname = arg_file; + + bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); + return ok ? 0 : 1; +} diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index ae8cfe0af..1f14c18de 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -158,6 +158,16 @@ int main(int argc, char ** argv) { return 1; } +#ifdef GGML_USE_MULMAT_TUNE + if (params.tune || !params.tune_file.empty()){ + bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); + if (!ok || (params.tune && !params.tune_file.empty())) { + llama_free(ctx); + return ok? 0: 1; + } + } +#endif + // print system information { fprintf(stderr, "\n"); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 16488b9f9..cf52109bc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2571,7 +2571,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { + if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { return false; } func = ggml_cuda_mul_mat; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 95f4cec6d..b2300a104 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_ } void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); if (src0->type == GGML_TYPE_F32) { ggml_cl_mul_mat_f32(src0, src1, dst); diff --git a/ggml-threading.c b/ggml-threading.c new file mode 100644 index 000000000..cf17793f6 --- /dev/null +++ b/ggml-threading.c @@ -0,0 +1,620 @@ + +#include +#include +#include + +#include "ggml-threading.h" +#include "ggml.h" + +#define UNUSED(x) (void)(x) + +// see https://github.com/ggerganov/llama.cpp/pull/1314 +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#include +static inline void ggml_spin_pause(void) { _mm_pause(); } +#else +static inline void ggml_spin_pause(void) {} +#endif + +#if defined(_WIN32) + +#include + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +typedef LONG atomic_flag; + +typedef CRITICAL_SECTION pthread_mutex_t; +typedef CONDITION_VARIABLE pthread_cond_t; +typedef void pthread_mutexattr_t; +typedef void pthread_condattr_t; + +typedef HANDLE pthread_t; + +static void atomic_store(atomic_int *ptr, LONG val) { + InterlockedExchange(ptr, val); +} + +static LONG atomic_load(atomic_int *ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} + +static LONG atomic_fetch_add(atomic_int *ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} + +static LONG atomic_fetch_sub(atomic_int *ptr, LONG dec) { + return 
atomic_fetch_add(ptr, -(dec)); +} + +static inline LONG atomic_flag_test_and_set(volatile atomic_flag *ptr) { + return InterlockedCompareExchange(ptr, 1, 0); +} +static inline LONG atomic_flag_clear(volatile atomic_flag *ptr) { + return InterlockedExchange(ptr, 0); +} +static int pthread_create(pthread_t *out, void *unused, + ggml_thread_ret_t (*func)(void *), void *arg) { + (void)unused; + HANDLE handle = + CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL); + if (handle == NULL) { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void *unused) { + (void)unused; + return (int)WaitForSingleObject(thread, INFINITE); +} + +static int pthread_mutex_init(pthread_mutex_t *mutex, + pthread_mutexattr_t *attr) { + (void)attr; + InitializeCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_destroy(pthread_mutex_t *mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_lock(pthread_mutex_t *mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_unlock(pthread_mutex_t *mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr) { + (void)attr; + InitializeConditionVariable(cond); + return 0; +} + +static int pthread_cond_destroy(pthread_cond_t *cond) { + (void)cond; + return 0; +} + +static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { + SleepConditionVariableCS(cond, mutex, INFINITE); + return 0; +} + +static int pthread_cond_signal(pthread_cond_t *cond) { + WakeConditionVariable(cond); + return 0; +} + +static int pthread_cond_broadcast(pthread_cond_t *cond) { + WakeAllConditionVariable(cond); + return 0; +} + +static int sched_yield(void) { + // https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-yieldprocessor + YieldProcessor(); + return 0; +} + +#else // ! _WIN32 + +#include +#include +#include + +#endif + +// #define GGML_THREADING_DEBUG 1 + +#ifdef GGML_THREADING_DEBUG +#define PRINT_DEBUG(...) fprintf(stdout, __VA_ARGS__) +#else +#define PRINT_DEBUG(...) +#endif + +struct ggml_perf_stats { + int runs; + + // total cycles + atomic_int cycles; + + // total time in us. + atomic_int time_us; +}; + +struct ggml_compute_state_shared { + atomic_flag spin; + pthread_mutex_t mutex; + pthread_cond_t cond; + + // number of threads that has entered thread runner. + atomic_int n_ready; + + // number of assigned but unfinished tasks, workers decrease it. + atomic_int n_tasks; + + // number of waiting workers, workers increase it. + atomic_int n_waiting; + + // commands. + atomic_bool wait_now; + atomic_bool wait_on_done; + atomic_bool stop; + + ggml_threading_task_runner *task_runner; + + struct ggml_threading_context *ctx; +}; +struct ggml_compute_state { + pthread_t thrd; + + atomic_bool has_work; + struct ggml_compute_params params; + struct ggml_tensor *node; + + struct ggml_compute_state_shared *shared; +}; +struct ggml_threading_context { + int n_threads; + struct ggml_compute_state_shared shared; + struct ggml_compute_state *workers; + + enum ggml_threading_features features; + + struct ggml_perf_stats wait_perf; + struct ggml_perf_stats wakeup_perf; + + int64_t *stages_time; +}; + +// NOTE: ggml_spin_lock and ggml_spin_unlock may can be noop if +// feature wait_on_done is off. 
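+// Descriptive note: the spin flag guards the shared scheduling state below
+// (n_tasks, n_waiting, wait_now, wait_on_done and each worker's has_work);
+// both the main thread and the workers take it around every state transition.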
+static inline void ggml_spin_lock(volatile atomic_flag *obj) { + while (atomic_flag_test_and_set(obj)) { + ggml_spin_pause(); + } +} + +static inline void ggml_spin_unlock(volatile atomic_flag *obj) { + atomic_flag_clear(obj); +} + +static inline void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0, + int64_t t0) { + st->runs++; + st->cycles += (ggml_cycles() - c0); + st->time_us += (ggml_time_us() - t0); +} + +// A worker thread goes cond waiting. +// NOTE: must be protected by shared->spin +static void ggml_threading_cond_wait(struct ggml_compute_state *state) { + struct ggml_compute_state_shared *shared = state->shared; + + int64_t perf_cycles_0 = 0; + int64_t perf_time_0 = 0; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + perf_cycles_0 = ggml_cycles(); + perf_time_0 = ggml_time_us(); + } + + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + + if (!shared->wait_now) { + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ggml_spin_unlock(&shared->spin); + return; + } + + shared->n_waiting++; + ggml_spin_unlock(&shared->spin); + + GGML_ASSERT(pthread_cond_wait(&shared->cond, &shared->mutex) == 0); + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + + ggml_spin_lock(&shared->spin); + + shared->n_waiting--; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + ggml_perf_collect(&shared->ctx->wait_perf, perf_cycles_0, perf_time_0); + } +} + +// Wakeup all workers. +// +// Workers takes some time to wakeup, and has to lock spin after wakeup. Yield +// is used to avoid signal frequently. Current implementation is highly +// experimental. See tests/test-ggml-threading.c for details. +// +// NOTE: must be protected by shared->spin +static void +ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { + int64_t perf_cycles_0 = 0; + int64_t perf_time_0 = 0; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + perf_cycles_0 = ggml_cycles(); + perf_time_0 = ggml_time_us(); + } + + shared->wait_now = false; + + int loop_counter = 0; + int notify_counter = 0; + int64_t last_signal_time = 0; + + while (shared->n_waiting != 0) { + ggml_spin_unlock(&shared->spin); + + if (loop_counter > 0) { + ggml_spin_pause(); + if (loop_counter > 3) { + sched_yield(); + } + } + ++loop_counter; + + // TODO: should bench actual average wait/wakeup time. + if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) { + continue; + } + + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ++notify_counter; + last_signal_time = ggml_time_us(); + + ggml_spin_lock(&shared->spin); + } + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, + perf_time_0); + } + + // if (notify_counter > 1) { + // printf("%s: loop counter: %d, notify counter: %d\n", __func__, + // loop_counter, notify_counter); + // } + UNUSED(notify_counter); +} + +// Setup workers for a task stage. 
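+// For a parallel stage this wakes any sleeping workers and, when the
+// WAIT_ON_DONE feature is enabled, arms wait_on_done if a later non-parallel
+// stage asks workers to wait. For a non-parallel stage with `wait` set, it
+// raises wait_now and blocks until every worker has parked on the condition
+// variable.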
+// NOTE: must be protected by shared->spin +static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, + struct ggml_task_profile *profile, + enum ggml_task_type type) { + PRINT_DEBUG("[main] setup workers for task ...\n"); + +#ifdef GGML_THREADING_DEBUG + int64_t t0 = ggml_time_us(); +#endif + + const int n_worker_threads = ctx->n_threads - 1; + struct ggml_task_stage *current = &profile->stages[type]; + struct ggml_compute_state_shared *shared = &ctx->shared; + + if (current->parallel) { + if (shared->n_waiting > 0) { + ggml_threading_wakeup_workers(shared); + } + + if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) { + // Optimize energy: wait_on_done. We MAY also check following nodes, + // but that's a bit complicated. + shared->wait_on_done = false; + for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) { + struct ggml_task_stage *next = &profile->stages[i]; + if (next->parallel) { + break; + } + if (next->wait) { + shared->wait_on_done = true; + PRINT_DEBUG("[main] wait_on_done is enabled for " + "current task stage\n"); + break; + } + } + } + } else if (current->wait) { + if (shared->n_waiting < n_worker_threads) { + shared->wait_now = true; + PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n", + n_worker_threads); + ggml_spin_unlock(&shared->spin); + + while (shared->n_waiting != n_worker_threads) { + ggml_spin_pause(); + } + + ggml_spin_lock(&shared->spin); + PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + } + } + + PRINT_DEBUG("[main] setup workers for task took %d us\n", + (int)(ggml_time_us() - t0)); +} + +ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { + GGML_ASSERT(data); + struct ggml_compute_state *state = (struct ggml_compute_state *)data; + GGML_ASSERT(state); + + struct ggml_compute_state_shared *shared = state->shared; + GGML_ASSERT(shared); + GGML_ASSERT(shared->task_runner); + + shared->n_ready++; + + PRINT_DEBUG("[%d-th] running\n", state->params.ith); + + while (!shared->stop) { + if (shared->wait_now) { + ggml_spin_lock(&shared->spin); + if (!state->has_work) { + ggml_threading_cond_wait(state); + } + ggml_spin_unlock(&shared->spin); + } + + if (shared->n_tasks > 0 && state->has_work) { + enum ggml_compute_error err = + shared->task_runner(&state->params, state->node); + + GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); + + ggml_spin_lock(&shared->spin); + + state->has_work = false; + shared->n_tasks--; + + bool wait = shared->wait_on_done && !state->has_work; + if (wait) { + ggml_threading_cond_wait(state); + } + + ggml_spin_unlock(&shared->spin); + + // no need to pause. + if (wait) { + continue; + } + } + + ggml_spin_pause(); + } + + PRINT_DEBUG("[%d-th] exited\n", state->params.ith); + return 0; +} + +enum ggml_compute_error +ggml_threading_compute_tensor(struct ggml_threading_context *ctx, + struct ggml_tensor *node, void *wdata, + size_t wsize) { + GGML_ASSERT(ctx); + GGML_ASSERT(node); + + GGML_ASSERT(ctx->shared.task_runner); + struct ggml_compute_state_shared *state_shared = &ctx->shared; + + // This is the params for main thread. 
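+    // Dispatch protocol for each enabled stage: worker j is given ith = j + 1
+    // and has_work = true, shared->n_tasks is set to n_tasks - 1, the main
+    // thread runs the ith == 0 slice itself, then spins until the workers have
+    // decremented n_tasks back to zero.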
+ struct ggml_compute_params params; + enum ggml_compute_error err; + + for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { + if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { + continue; + } + + PRINT_DEBUG("[main] stage: %d\n", type); + + int64_t t_stage = 0; + if (ctx->stages_time) { + t_stage = ggml_time_us(); + } + + // n_tasks is the total number of parallel computing tasks + // (including main thread). + int n_tasks = + node->task_profile.stages[type].parallel ? ctx->n_threads : 1; + + ggml_spin_lock(&state_shared->spin); + + if (ctx->n_threads > 1) { + ggml_threading_setup_workers(ctx, &node->task_profile, type); + } + + if (n_tasks > 1) { + // setup compute task parameters. + for (int j = 0; j < n_tasks - 1; j++) { + ctx->workers[j].params = (struct ggml_compute_params){ + .type = type, + .ith = j + 1, + .nth = n_tasks, + .wsize = wsize, + .wdata = wdata, + }; + ctx->workers[j].node = node; + ctx->workers[j].has_work = true; + } + state_shared->n_tasks = n_tasks - 1; + PRINT_DEBUG("[main] assigned %d tasks\n", state_shared->n_tasks); + } + + ggml_spin_unlock(&state_shared->spin); + + // main thread always run the 0-th task. + // TODO: assert(params->nth == 1) instead of + // assert(params->ith == 0) + { + params.type = type; + params.ith = 0; + params.nth = n_tasks; + params.wsize = wsize; + params.wdata = wdata; + + err = state_shared->task_runner(¶ms, node); + } + + // wait for tasks done. + if (n_tasks > 1) { + while (state_shared->n_tasks != 0) { + ggml_spin_pause(); + } + } + + PRINT_DEBUG("[main] all tasks finished\n\n"); + + if (ctx->stages_time) { + ctx->stages_time[type] = ggml_time_us() - t_stage; + } + + if (err != GGML_COMPUTE_OK) { + return err; + } + } + + return GGML_COMPUTE_OK; +} + +struct ggml_threading_context * +ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, + ggml_threading_task_runner *task_stage_runner, + enum ggml_threading_features features, + int64_t stages_time[3]) { + GGML_ASSERT(n_threads > 0); + GGML_ASSERT(thread_runner); + GGML_ASSERT(task_stage_runner); + + size_t ctx_sz = sizeof(struct ggml_threading_context); + struct ggml_threading_context *ctx = malloc(ctx_sz); + GGML_ASSERT(ctx); + memset(ctx, 0, ctx_sz); + + ctx->shared = (struct ggml_compute_state_shared){ + .spin = {0}, + .n_ready = 0, + .n_tasks = 0, + .n_waiting = 0, + .wait_now = false, + .wait_on_done = false, + .stop = false, + .task_runner = task_stage_runner, + .ctx = ctx, + }; + + PRINT_DEBUG("[main] thread start, features: %d\n", features); + + ctx->n_threads = n_threads; + ctx->features = features; + ctx->stages_time = stages_time; + + int n_workers = n_threads - 1; + if (n_workers > 0) { + GGML_ASSERT(pthread_mutex_init(&ctx->shared.mutex, NULL) == 0); + GGML_ASSERT(pthread_cond_init(&ctx->shared.cond, NULL) == 0); + + size_t workers_sz = sizeof(struct ggml_compute_state) * n_workers; + struct ggml_compute_state *workers = malloc(workers_sz); + GGML_ASSERT(workers); + memset(workers, 0, workers_sz); + + for (int j = 0; j < n_workers; j++) { + workers[j].shared = &ctx->shared; + GGML_ASSERT(pthread_create(&workers[j].thrd, NULL, thread_runner, + &workers[j]) == 0); + } + + ctx->workers = workers; + + while (ctx->shared.n_ready != n_workers) { + ggml_spin_pause(); + } + } + + return ctx; +} + +static void +ggml_threading_print_perf_stats(struct ggml_threading_context *ctx) { + bool print_stats = ctx->features & GGML_THREADING_FEATURE_PERF; +#ifdef GGML_THREADING_DEBUG + print_stats = true; +#endif + + if 
(!print_stats) { + return; + } + + const char *prefix_arr[2] = {"[threading wait ]", "[threading wakeup]"}; + struct ggml_perf_stats *st_arr[2] = {&ctx->wait_perf, &ctx->wakeup_perf}; + for (int i = 0; i < 2; i++) { + struct ggml_perf_stats *st = st_arr[i]; + if (st->runs == 0) { + continue; + } + fprintf(stdout, + "%s runs: %4d, avg cycles: %8.3f ms, avg time: " + "%8.3f ms\n", + prefix_arr[i], st->runs, + 1.0 * st->cycles / (st->runs * ggml_cycles_per_ms()), + 1.0 * st->time_us / (st->runs * 1000)); + } +} + +void ggml_threading_stop(struct ggml_threading_context *ctx) { + GGML_ASSERT(ctx); + + if (ctx->workers) { + PRINT_DEBUG("[main] stopping thread pool ...\n"); + ctx->shared.stop = true; + + ggml_spin_lock(&ctx->shared.spin); + ggml_threading_wakeup_workers(&ctx->shared); + ggml_spin_unlock(&ctx->shared.spin); + + for (int j = 0; j < ctx->n_threads - 1; j++) { + GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0); + } + free(ctx->workers); + PRINT_DEBUG("[main] thread pool stopped\n"); + } + + ggml_threading_print_perf_stats(ctx); + + free(ctx); +} diff --git a/ggml-threading.h b/ggml-threading.h new file mode 100644 index 000000000..f3214efc7 --- /dev/null +++ b/ggml-threading.h @@ -0,0 +1,68 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) +typedef int ggml_thread_ret_t; +#else +typedef void *ggml_thread_ret_t; +#endif + +struct ggml_threading_context; + +// Optional (experimental) features. +enum ggml_threading_features { + GGML_THREADING_FEATURE_NONE = 0, + GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0, + GGML_THREADING_FEATURE_PERF = 1 << 1, +}; + +// Compute errors. +enum ggml_compute_error { + GGML_COMPUTE_OK = 0, + GGML_COMPUTE_FALLBACK = 1, +}; + +// The task runner to be called by main thread and workers. +typedef enum ggml_compute_error(ggml_threading_task_runner)( + struct ggml_compute_params *params, struct ggml_tensor *node); + +// The thread runner to feed into OS threads. +typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); + +// Init and start underlying workers if n_threads > 1. +// +// features: optional for configure threading additional features. +// see `ggml_threading_feature`, default 0. +// stages_time: optional for collecting per-stage wall clock time. +struct ggml_threading_context * +ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, + ggml_threading_task_runner *task_stage_runner, + enum ggml_threading_features features, + int64_t stages_time[3]); + +// Stop workers (if exist), free memories (including the ctx). +void ggml_threading_stop(struct ggml_threading_context *ctx); + +// The default implementation of `ggml_threading_thread_runner` +ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data); + +// Compute a tensor. It computes the enabled task stages one by one. +// Caller should take care of the return error: retry for fallback error. +enum ggml_compute_error +ggml_threading_compute_tensor(struct ggml_threading_context *ctx, + struct ggml_tensor *node, void *wdata, + size_t wsize); + +// This is an experimental functionality for mulmat tune, as a thin wrapper. 
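+// It is the task runner that ggml-tune.c hands to ggml_threading_start();
+// presumably it simply forwards to ggml's internal compute-forward logic and
+// reports the result through enum ggml_compute_error.
+//
+// A minimal usage sketch of this header (hypothetical caller; `node`, `wdata`
+// and `wsize` are assumed to be prepared elsewhere, e.g. as in ggml-tune.c):
+//
+//   int64_t stages_time[3];
+//   struct ggml_threading_context *tctx = ggml_threading_start(
+//       n_threads, ggml_threading_graph_compute_thread,
+//       ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_NONE, stages_time);
+//   enum ggml_compute_error err =
+//       ggml_threading_compute_tensor(tctx, node, wdata, wsize);
+//   // on GGML_COMPUTE_FALLBACK the caller may retry with another task profile
+//   ggml_threading_stop(tctx);
+//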
+enum ggml_compute_error +ggml_compute_forward_wrapper(struct ggml_compute_params *params, + struct ggml_tensor *tensor); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-tune.c b/ggml-tune.c new file mode 100644 index 000000000..fbca953ed --- /dev/null +++ b/ggml-tune.c @@ -0,0 +1,897 @@ +#include + +#include "ggml-threading.h" +#include "ggml-tune.h" +#include "ggml.h" + +// MUL_MAT fine tunning for non-GPU-offloading cases. + +#define GGML_MULMAT_CACHE_LEN 16 +static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0}; + +#define FNV_OFFSET 14695981039346656037UL +#define FNV_PRIME 1099511628211UL +static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) { + char buf[30]; + snprintf(buf, 30, "%d%d%d", M, N, K); + + uint64_t hash = FNV_OFFSET; + for (const char *p = buf; *p; p++) { + hash ^= (uint64_t)(unsigned char)(*p); + hash *= FNV_PRIME; + } + return hash; +} + +static const char * +ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { + switch (backend) { + case GGML_TASK_BACKEND_NONE: + return ""; + case GGML_TASK_BACKEND_CPU: + return "CPU"; + case GGML_TASK_BACKEND_CPU_BLAS: + return "BLAS"; + case GGML_TASK_BACKEND_GPU: + return "GPU"; + case GGML_TASK_BACKEND_GPU_CUDA: + return "CUDA"; + case GGML_TASK_BACKEND_GPU_CL: + return "CL"; + default: + GGML_ASSERT(false); + } +} + +const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( + struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, int stages_time[3]) { + GGML_ASSERT(tune); + + // TODO: default_mm_cache is thread-unsafe. + struct mm_cache_element *mm_cache = default_mm_cache; + int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN; + struct mm_cache_element *e = &mm_cache[slot]; + + struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; + + struct ggml_task_profile *prof = NULL; + + if (e->M == M && e->N == N && e->K == K) { + prof = e->profile; + if (stages_time) { + for (int i = 0; i < 3; i++) { + stages_time[i] = e->stages_time[i]; + } + } + } else { + const struct ggml_mulmat_tune_shape *shape = NULL; + shape = ggml_mulmat_tune_get_shape(tune, N, K, src0_t, src1_t); + if (shape) { + ggml_mulmat_tune_estimate_time(shape, M, profiles_time); + + int min = INT32_MAX; + int index = -1; + for (int i = 0; i < shape->n_profiles; i++) { + int total = profiles_time[i].total_time; + if (total < min) { + min = total; + index = i; + } + } + + if (index >= 0) { + prof = profiles_time[index].profile; + for (int i = 0; i < 3; i++) { + int t = profiles_time[index].stage_time[i]; + if (stages_time) { + stages_time[i] = t; + } + e->stages_time[i] = t; + } + + GGML_ASSERT(prof); + + e->profile = prof; + e->M = M; + e->N = N; + e->K = K; + + // to disable this, build with + // `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 + // make` +#if !defined(GGML_MULMAT_TUNE_NDEBUG) + const char *names[3]; + for (int i = 0; i < 3; i++) { + names[i] = ggml_mulmat_tune_task_backend_name( + prof->stages[i].backend); + } + printf( + "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the " + "fastest profile: %s %s %s\n", + M, N, K, names[0], names[1], names[2]); +#endif + } + } + } + + return prof; +} + +void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, + const char *name, enum ggml_ftype ftype) { + const int n_vocab = 32000; + int n_embd; + // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult + int n_ff; + // n_rot = n_embd/n_head; + int n_rot; + + if (strcmp(name, "3B") == 0) { 
+ // n_head=32, n_mult=216, n_layer=26 + // https://github.com/ggerganov/llama.cpp/pull/1588 + n_embd = 3200; + n_ff = 8640; + n_rot = 100; + } else if (strcmp(name, "7B") == 0) { + n_embd = 4096; + n_ff = 11008; + n_rot = 128; + } else if (strcmp(name, "13B") == 0) { + n_embd = 5120; + n_ff = 13824; + n_rot = 128; + } else if (strcmp(name, "30B") == 0) { + n_embd = 6656; + n_ff = 17920; + n_rot = 128; + } else if (strcmp(name, "65B") == 0) { + n_embd = 8192; + n_ff = 22016; + n_rot = 128; + } else { + GGML_ASSERT(false); + } + + model->name = name; + model->ftype = ftype; + model->n_vocab = n_vocab; + model->n_embd = n_embd; + model->n_ff = n_ff; + model->n_rot = n_rot; +} + +bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params, + struct ggml_task_profile_factory *pf) { + + struct ggml_mulmat_tune_model *model = ¶ms->model; + + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + + tune->version = GGML_MULMAT_TUNE_VERSION; + tune->n_threads = params->n_threads; + tune->ftype = model->ftype; + + size_t name_len = strlen(model->name); + GGML_ASSERT(name_len > 0); + strncpy(tune->model, model->name, sizeof(tune->model) - 1); + + const enum ggml_type rot_src0_type = GGML_TYPE_F16; + const enum ggml_type src1_type = GGML_TYPE_F32; + + int n_vocab = model->n_vocab; + int n_embd = model->n_embd; + int n_ff = model->n_ff; + int n_rot = model->n_rot; + + enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype); + + GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6); + tune->n_shapes = GGML_MULMAT_N_SHAPES; + + // Attention layers + tune->shapes[0] = (struct ggml_mulmat_tune_shape){ + .N = n_embd, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + // Feed forward layers + tune->shapes[1] = (struct ggml_mulmat_tune_shape){ + .N = n_embd, .K = n_ff, .src0_type = type, .src1_type = src1_type}; + tune->shapes[2] = (struct ggml_mulmat_tune_shape){ + .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + tune->shapes[3] = (struct ggml_mulmat_tune_shape){ + .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + // RoPE + tune->shapes[4] = (struct ggml_mulmat_tune_shape){ + .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type}; + tune->shapes[5] = (struct ggml_mulmat_tune_shape){ + .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type}; + + for (int i = 0; i < tune->n_shapes; i++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + shape->n_profiles = ggml_mulmat_get_task_profiles( + pf, shape->src0_type, shape->src1_type, &shape->profiles); + if (shape->n_profiles == 0) { + // allowed for testing. + continue; + } + + shape->m_num = params->m_num; + shape->arr_m = malloc(shape->m_num * sizeof(int)); + for (int j = 0; j < shape->m_num; j++) { + shape->arr_m[j] = 1 << j; + } + + size_t sz = sizeof(struct ggml_mulmat_tune_m) * + (shape->n_profiles * shape->m_num); + shape->items = malloc(sz); + GGML_ASSERT(shape->items); + memset(shape->items, 0, sz); + } + + return true; +} + +void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { + for (int i = 0; i < tune->n_shapes; i++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + GGML_ASSERT(shape); + + // arr_m and items can be NULL only when testing. 
+ if (shape->arr_m) { + free(shape->arr_m); + } + if (shape->items) { + free(shape->items); + } + } +} + +static bool ggml_mulmat_tune_write_profiles( + FILE *fp, const struct ggml_task_profile *profiles, int n_profiles) { + int rc; + for (int i = 0; i < n_profiles; i++) { + const struct ggml_task_profile *profile = &profiles[i]; + for (int j = 0; j < 3; j++) { + const struct ggml_task_stage *ts = &profile->stages[j]; + rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, + ts->wait ? 1 : 0); + if (rc <= 0) { + return false; + } + if (j < 2) { + rc = fprintf(fp, " "); + if (rc <= 0) { + return false; + } + } + } + rc = fprintf(fp, "\n"); + if (rc <= 0) { + return false; + } + } + + return true; +} + +static bool +ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, + const char *model, int ftype, int n_threads, + char *errbuf, int errbuf_len) { + + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + snprintf(errbuf, errbuf_len - 1, + "version mismatch, built-in: %d, " + "yours: %d", + GGML_MULMAT_TUNE_VERSION, tune->version); + return false; + } else if (strcmp(model, tune->model) != 0) { + snprintf(errbuf, errbuf_len - 1, + "model mismatch. built-in: %s, yours: %s", model, tune->model); + return false; + } else if (ftype != tune->ftype) { + snprintf(errbuf, errbuf_len - 1, + "ftype mismatch. built-in: %d, yours: %d\n", ftype, + tune->ftype); + return false; + } else if (n_threads != tune->n_threads) { + snprintf(errbuf, errbuf_len - 1, + "n_threads mismatch. run-time: %d, yours: %d\n", n_threads, + tune->n_threads); + return false; + } + + for (int i = 0; i < tune->n_shapes; i++) { + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + + struct ggml_task_profile *builtin_profiles = NULL; + int n_profiles = ggml_mulmat_get_task_profiles( + NULL, shape->src0_type, shape->src1_type, &builtin_profiles); + + if (n_profiles != shape->n_profiles) { + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + return false; + } + + // TODO: profiles order is relevant, too strict. + size_t sz = sizeof(struct ggml_task_profile) * n_profiles; + if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + + printf("=== built-in profiles:\n"); + ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, + n_profiles); + + printf("=== incoming profiles:\n"); + ggml_mulmat_tune_write_profiles(stderr, shape->profiles, + shape->n_profiles); + return false; + } + } + + return true; +} + +bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, + const char *model, int ftype, int n_threads) { + char errbuf[128]; + bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads, + errbuf, sizeof(errbuf)); + if (!ok) { + fprintf(stderr, "[mulmat tune] error: %s. 
run bench again.\n", errbuf); + } + + return ok; +} + +bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { + int rc = fscanf(fp, "%d", &tune->version); + if (rc <= 0) { + return false; + } + + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n"); + return false; + } + + rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, + &tune->n_shapes, &tune->n_threads); + if (rc <= 0) { + return false; + } + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + + rc = fscanf(fp, "%d %d %d %d %d %d", &shape->N, &shape->K, + (int *)&shape->src0_type, (int *)&shape->src1_type, + &shape->n_profiles, &shape->m_num); + if (rc <= 0) { + return false; + } + + { + size_t item_size = sizeof(struct ggml_mulmat_tune_m) * + (shape->n_profiles * shape->m_num); + shape->items = malloc(item_size); + if (shape->items == NULL) { + fprintf(stderr, "[mulmat tune] failed to allocate memory\n"); + return false; + } + memset(shape->items, 0, item_size); + } + + { + size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles; + shape->profiles = malloc(sz); + GGML_ASSERT(shape->profiles); + memset(shape->profiles, 0, sz); + } + + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_task_profile *profile = &shape->profiles[ip]; + for (int j = 0; j < 3; j++) { + struct ggml_task_stage *ts = &profile->stages[j]; + int backend; + int parallel; + int wait; + rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait); + if (rc <= 0) { + return false; + } + ts->backend = (enum ggml_task_backend)backend; + ts->parallel = parallel ? true : false; + ts->wait = wait ? true : false; + } + } + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + int M; + for (int ip = 0; ip < shape->n_profiles; ip++) { + if (ip == 0) { + rc = fscanf(fp, "%d", &M); + if (rc <= 0) { + return false; + } + } + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + item->M = M; + rc = fscanf(fp, "%d %d %d", &item->stages_time[0], + &item->stages_time[1], &item->stages_time[2]); + if (rc <= 0) { + return false; + } + } + } + } + + return true; +} + +bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, + FILE *fp) { + int rc; + rc = fprintf(fp, "%d %s %d %d %d\n\n", tune->version, tune->model, + tune->ftype, tune->n_shapes, tune->n_threads); + if (rc <= 0) { + return false; + } + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + if (i_shape > 0) { + fprintf(fp, "\n"); + } + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + rc = fprintf(fp, "%d %d %d %d %d %d\n", shape->N, shape->K, + shape->src0_type, shape->src1_type, shape->n_profiles, + shape->m_num); + if (rc <= 0) { + return false; + } + + rc = ggml_mulmat_tune_write_profiles(fp, shape->profiles, + shape->n_profiles); + if (rc <= 0) { + return false; + } + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + if (ip == 0) { + rc = fprintf(fp, "%4d", item->M); + if (rc <= 0) { + return false; + } + } + + struct ggml_task_profile *profile = &shape->profiles[ip]; + for (int k = 0; k < 3; k++) { + if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { + rc = fprintf(fp, "%9d", item->stages_time[k]); + if (rc <= 0) { + return false; + } + } else { + rc = fprintf(fp, " 0"); + if (rc <= 0) { + return false; + } + } + } + } + rc 
= fprintf(fp, "\n"); + if (rc <= 0) { + return false; + } + } + } + + return true; +} + +const struct ggml_mulmat_tune_shape * +ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, + const int K, enum ggml_type src0_type, + enum ggml_type src1_type) { + GGML_ASSERT(N > 0 && K > 0); + + for (int i = 0; i < tune->n_shapes; i++) { + const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; + if (s->src0_type != src0_type || s->src1_type != src1_type) { + continue; + } + + if (s->N > 0 && s->K > 0) { + if (s->N == N && s->K == K) { + return s; + } + } else if (s->N > 0 && s->K == 0) { + if (s->N == N) { + return s; + } + } else if (s->N == 0 && s->K > 0) { + if (s->K == K) { + return s; + } + } + } + + return NULL; +} + +// This is the experimental reference implementation. +// Requires both n_threads are same at bench time and runtime. +void ggml_mulmat_tune_estimate_time( + const struct ggml_mulmat_tune_shape *shape, int M, + struct ggml_mulmat_tune_time *profile_time) { + + GGML_ASSERT(shape); + GGML_ASSERT(profile_time); + + const int m_num = shape->m_num; + const int min_m = shape->items[0].M; + const int max_m = shape->items[m_num - 1].M; + + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_task_profile *profile = &shape->profiles[ip]; + profile_time[ip].total_time = 0; + profile_time[ip].profile = profile; + + const int items_offset = ip * m_num; + + struct ggml_mulmat_tune_m *p0 = NULL; + struct ggml_mulmat_tune_m *p1 = NULL; + if (M < min_m) { + // first two. + p0 = &shape->items[items_offset]; + p1 = &shape->items[items_offset + 1]; + } else if (M > max_m) { + // last two + p0 = &shape->items[items_offset + m_num - 2]; + p1 = &shape->items[items_offset + m_num - 1]; + } else { + for (int i = 0; i < m_num; i++) { + p1 = &shape->items[items_offset + i]; + if (p1->M == M) { + p0 = p1; + break; + } + + if (i > 0) { + p0 = (struct ggml_mulmat_tune_m *)(p1 - 1); + if (M > p0->M && M < p1->M) { + break; + } + } + } + } + + GGML_ASSERT(p0 && p1); + + for (int i_stage = 0; i_stage < 3; i_stage++) { + struct ggml_task_stage *stage = &profile->stages[i_stage]; + if (stage->backend == GGML_TASK_BACKEND_NONE) { + continue; + } + + int p0_v = p0->stages_time[i_stage]; + int p1_v = p1->stages_time[i_stage]; + + GGML_ASSERT(p0_v >= 0); + GGML_ASSERT(p1_v >= 0); + + // t = aM + b + double a; + double b; + + if (p0 == p1) { + a = 0.0; + b = p1_v; + } else { + a = 1.0 * (p1_v - p0_v) / (p1->M - p0->M); + b = p1_v - a * p1->M; + } + int t = (int)(a * M + b); + + profile_time[ip].stage_time[i_stage] = t; + profile_time[ip].total_time += t; + } + } +} + +// Experimental: create mul_mat tensor. +static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K, + enum ggml_type src0_type, + struct ggml_context **ctx) { + // At most 256, because in `ggml_quantize_qx_x`, the index type of hist is + // either int8_t or uint8_t. + // Use 1024 to avoid suddenly broken. 
+ int64_t hist[1024]; + + bool src0_is_quantized = ggml_is_quantized(src0_type); + + size_t ctx_size = 0; + ctx_size += (size_t)(M * N * ggml_type_sizef(GGML_TYPE_F32)); // src1 + ctx_size += (size_t)(N * K * ggml_type_sizef(src0_type)); // src0 + ctx_size += (size_t)(1024 * 1024 * 64); // experimental + + if (src0_is_quantized) { + // quantize F32 to Qx_x + ctx_size += (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)); + } + + struct ggml_init_params init_params = { + .mem_size = ctx_size, + .mem_buffer = NULL, + .no_alloc = 0, + }; + + *ctx = ggml_init(init_params); + GGML_ASSERT(*ctx); + + // src0: N x K + struct ggml_tensor *src0 = + ggml_new_tensor_2d(*ctx, src0_type, (int64_t)K, (int64_t)N); + + // src1: M x K + struct ggml_tensor *src1 = + ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)M); + ggml_set_f32(src1, 0.5f); + + if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) { + ggml_set_f32(src0, 0.1f); + } else if (src0_is_quantized) { + struct ggml_tensor *src0_f32 = + ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N); + ggml_set_f32(src0_f32, 0.1f); + + switch (src0_type) { + case GGML_TYPE_Q4_0: + ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q4_1: + ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q5_0: + ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q5_1: + ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q8_0: + ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + default: + GGML_ASSERT(false); + } + } else { + GGML_ASSERT(false); + } + + // node: M x N + // Will compute z = y * xT, z: node, y: src1, x: src0 + return ggml_mul_mat(*ctx, src0, src1); +} + +// Experimental: allocate memory for wdata with max possible size. +// This part of code is actually belongs to ggml compute graph. +static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { + // The size is actually determined by cgraph before computing. + // Apart from the src0_type, wsize is affected by backend, cache line size, + // n_threads etc. 
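+    // (Sizing note, inferred from the wsize logic in ggml.c: the dequantized
+    // src0 used by the CPU/BLAS paths needs up to N*K floats, which is what
+    // the first term below covers; `extra` is just slack.)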
+ + const size_t extra = 1024 * 1024; + size_t sz = (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)) + extra; + void *buf = malloc(sz); + + if (!buf) { + fprintf(stderr, + "[mulmat tune] error: failed to allocate %zu MiB memory", + sz / 1024 / 1024); + return 0; + } + + memset(buf, 0, sz); + *wdata = buf; + return sz; +} + +int ggml_mulmat_tune_get_builtin_task_backends( + enum ggml_task_backend *backends) { + int i = 0; + backends[i++] = GGML_TASK_BACKEND_CPU; + + if (ggml_cpu_has_cpublas()) { + backends[i++] = GGML_TASK_BACKEND_CPU_BLAS; + } + + if (ggml_cpu_has_cublas()) { + backends[i++] = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + backends[i++] = GGML_TASK_BACKEND_GPU_CL; + } + return i; +} + +bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params) { + GGML_ASSERT(tune); + GGML_ASSERT(params); + GGML_ASSERT(params->model.name); + + enum ggml_task_backend backends[16]; + int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); + if (n_backends < 2) { + fprintf(stderr, + "[mulmat tune] error: this program was not built with BLAS.\n"); + return false; + } + + bool ok = ggml_mulmat_tune_init(tune, params, NULL); + if (!ok) { + return false; + } + + { + char buf[128] = {0}; + int offset = 0; + + for (int i = 0; i < n_backends; i++) { + if (i > 0) { + buf[offset++] = ','; + buf[offset++] = ' '; + } + const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); + size_t len = strlen(name); + memcpy(&buf[offset], name, len); + offset += (int)len; + } + + fprintf(stdout, + "[mulmat tune] model: %s, ggml ftype: %d, " + "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", + params->model.name, params->model.ftype, params->n_pass, + params->n_threads, tune->n_shapes, buf); + } + + int64_t stages_time[3]; + int64_t t0 = ggml_time_ms(); + + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + tune->n_threads, ggml_threading_graph_compute_thread, + ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE, + stages_time); + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + int M; + int N = shape->N; + int K = shape->K; + + char buf[20] = {0}; + int buf_len = sizeof(buf) - 1; + int line_len = 0; + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + M = shape->arr_m[i_m]; + if (shape->N == 0) { + N = M; + } else if (shape->K == 0) { + K = M; + } + + if (params->progress) { + line_len = snprintf(buf, buf_len, "%d %d %d ", N, K, M); + fprintf(stdout, "%s", buf); + fflush(stdout); + } + + char *wdata = NULL; + size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata); + if (wsize == 0) { + return false; + } + + struct ggml_context *ctx = NULL; + struct ggml_tensor *node = + ggml_mulmat_new_tensor(M, N, K, shape->src0_type, &ctx); + + for (int ip = 0; ip < shape->n_profiles; ip++) { + const struct ggml_task_profile *profile = &shape->profiles[ip]; + + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + item->M = M; + + int min[3] = {INT32_MAX, INT32_MAX, INT32_MAX}; + + for (int k = 0; k < params->n_pass; k++) { + for (int j = 0; j < 3; j++) { + stages_time[j] = 0; + } + + /*enum ggml_compute_error err = */ + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + + for (int i = 0; i < 3; i++) { + int v = (int)stages_time[i]; + if (v < min[i]) { + min[i] = v; + } + } + + if (params->progress) { + 
fprintf(stdout, "."); + fflush(stdout); + line_len++; + } + } + for (int i = 0; i < 3; i++) { + item->stages_time[i] = min[i]; + } + } + + ggml_free(ctx); + free(wdata); + + if (params->progress) { + line_len += 10; + for (int j = 0; j < line_len; j++) { + fprintf(stdout, "\b \b"); + } + fflush(stdout); + } + } + } + + ggml_threading_stop(thrd_ctx); + + fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n", + (int)(ggml_time_ms() - t0) / 1000); + + // output + + if (params->fname && strcmp(params->fname, "") != 0) { + FILE *fp = fopen(params->fname, "w"); + if (!fp) { + fprintf(stderr, + "[mulmat tune] warn: failed to open file `%s`, print to " + "console instead\n\n", + params->fname); + params->output_console = 1; + } else { + ok = ggml_mulmat_tune_write_data(tune, fp); + fclose(fp); + + if (ok) { + fprintf(stdout, "[mulmat tune] data was written to `%s`\n", + params->fname); + } else { + fprintf( + stderr, + "[mulmat tune] warn: failed to write file `%s`, print to " + "console instead\n\n", + params->fname); + params->output_console = 1; + } + } + } + + if (params->output_console) { + return ggml_mulmat_tune_write_data(tune, stdout); + } + + return true; +} diff --git a/ggml-tune.h b/ggml-tune.h new file mode 100644 index 000000000..404f1f1c4 --- /dev/null +++ b/ggml-tune.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_MULMAT_TUNE_VERSION 8 +#define GGML_MULMAT_N_SHAPES 6 + +#define GGML_MULMAT_MAX_PASS 3 + +struct ggml_mulmat_tune_m { + int M; + + int stages_time[3]; +}; + +struct ggml_mulmat_tune_model { + const char *name; + + enum ggml_ftype ftype; + + int n_vocab; + + int n_embd; + + // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult + int n_ff; + + // n_rot = n_embd/n_head; + int n_rot; +}; + +struct ggml_mulmat_tune_shape { + // For RoPE, one of N / K is 0. + int N; + int K; + + enum ggml_type src0_type; + enum ggml_type src1_type; + + int n_profiles; + struct ggml_task_profile *profiles; + + int m_num; + int *arr_m; + + struct ggml_mulmat_tune_m *items; +}; + +struct ggml_mulmat_tune { + int version; + + char model[16]; + + enum ggml_ftype ftype; + + int n_shapes; + // Given N/K, we bench for mul_mat [M,K] x [K,N]. + struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES]; + + int n_threads; +}; + +struct ggml_mulmat_tune_time { + struct ggml_task_profile *profile; + int stage_time[3]; + int total_time; +}; + +struct mm_cache_element { + int M; + int N; + int K; + struct ggml_task_profile *profile; + int stages_time[3]; +}; + +// params for tune/bench. +struct ggml_mulmat_tune_params { + struct ggml_mulmat_tune_model model; + int m_num; + int n_pass; + int n_threads; + bool progress; // print and clear '.' + bool output_console; // also print result to console + const char *fname; +}; + +// NOTE: stages_time is filled if not null. 
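+// (stages_time[i] is the estimated time of stage i in microseconds; pass NULL
+// when the estimates are not needed.)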
+const struct ggml_task_profile * +ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, int stages_time[3]); + +bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, + const char *model_name, int ftype, + int n_threads); + +void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, + const char *name, enum ggml_ftype ftype); + +bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params, + struct ggml_task_profile_factory *profile_factory); + +void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); + +bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); + +bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); + +const struct ggml_mulmat_tune_shape * +ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, + enum ggml_type src0_type, enum ggml_type src1_type); + +void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape, + int M, + struct ggml_mulmat_tune_time *profile_time); + +const char *ggml_task_backend_name(enum ggml_task_backend backend); + +int ggml_mulmat_tune_get_builtin_task_backends( + enum ggml_task_backend *backends); + +bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 78c365354..5d0b83b1d 100644 --- a/ggml.c +++ b/ggml.c @@ -61,26 +61,6 @@ static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void* unused) { - (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); -} - static int sched_yield (void) { Sleep (0); return 0; @@ -88,8 +68,6 @@ static int sched_yield (void) { #else #include #include - -typedef void* thread_ret_t; #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -166,6 +144,12 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif +#if defined(GGML_USE_MULMAT_TUNE) + #include "ggml-tune.h" +#endif + +#include "ggml-threading.h" + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -4059,6 +4043,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif + ggml_mulmat_init_task_profiles(); + is_first_call = false; } @@ -4302,7 +4288,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, + /*.task_profile =*/ { 0 }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -8516,14 +8502,19 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); + } + return; } - return; - } +#else + GGML_ASSERT(false); #endif + }; const int64_t nr = ggml_nrows(src0); @@ -9950,36 +9941,6 @@ static void ggml_compute_forward_rms_norm_back( } -// ggml_compute_forward_mul_mat - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10050,28 +10011,25 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -10089,16 +10047,13 @@ static void ggml_compute_forward_mul_mat_f32( //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; - } +#else + GGML_ASSERT(false); #endif - - if (params->type == GGML_TASK_INIT) { - return; } - if 
(params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); // parallelize by src0 rows using ggml_vec_dot_f32 @@ -10215,30 +10170,26 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); - - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -10271,8 +10222,14 @@ static void ggml_compute_forward_mul_mat_f16_f32( /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; - } +#else + GGML_ASSERT(false); #endif + } + + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); if (params->type == GGML_TASK_INIT) { ggml_fp16_t * const wdata = params->wdata; @@ -10293,9 +10250,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT (params->type == GGML_TASK_COMPUTE); // fp16 -> half the size, so divide by 2 // TODO: do not support transposed src1 @@ -10420,50 +10375,62 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT (init_backend == 
GGML_TASK_BACKEND_CPU); + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(src0->data); + GGML_ASSERT(params->wdata); float * const wdata = params->wdata; dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + if (params->type == GGML_TASK_INIT) { + // rows per thread + const int dr = (ne01 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + int ir1 = MIN(ir0 + dr, ne01); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; + float * wdata_offset = wdata + i03*ne03 + i02*ne02; + for (int64_t i = ir0; i < ir1; ++i) { + dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); + } + } + } + return; + } + + GGML_ASSERT(nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); - id += ne00; - } - - assert(id*sizeof(float) <= params->wsize); - } - + // zT = y * xT const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, @@ -10472,13 +10439,19 @@ static void ggml_compute_forward_mul_mat_q_f32( } } - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - return; - } +#else + GGML_ASSERT(false); #endif + } + + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); if (params->type == GGML_TASK_INIT) { + GGML_ASSERT(params->nth == 1); + char * wdata = params->wdata; const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; @@ -10490,13 +10463,10 @@ static void ggml_compute_forward_mul_mat_q_f32( } } } - return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); // parallelize by src0 rows using ggml_vec_dot_q @@ -14324,20 +14294,31 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); -#ifdef GGML_USE_CUBLAS - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return; + enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { +#if defined(GGML_USE_CUBLAS) + bool skip_cpu = ggml_cuda_compute_forward(params, tensor); + if (skip_cpu) { + return GGML_COMPUTE_OK; + } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; +#else + GGML_ASSERT(false); +#endif } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); -#endif // GGML_USE_CUBLAS + + // if 
(tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) { + // printf("mulmat: test fallback\n"); + // return GGML_COMPUTE_FALLBACK; + // } switch (tensor->op) { case GGML_OP_DUP: @@ -14585,6 +14566,15 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(false); } break; } + + return GGML_COMPUTE_OK; +} + +enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + // We call ggml_compute_forward because the CUDA mul_mat entry point + // was moved out of `ggml_compute_forward_mul_mat`. + return ggml_compute_forward(params, tensor); } //////////////////////////////////////////////////////////////////////////////// @@ -15480,6 +15470,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.tune =*/ NULL, /*.work_size =*/ 0, /*.work =*/ NULL, /*.nodes =*/ { NULL }, @@ -15533,175 +15524,288 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg return result; } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// +// ---- mulmat task profiles ---- -#ifdef __APPLE__ +static struct ggml_task_profile_factory default_task_profile_factory = {0}; -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT +// TODO: thread unsafe. Should be initialized once. 
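+// A possible fix (sketch only, not part of this change): make the init
+// idempotent with a C11 once-flag so concurrent first calls cannot race, e.g.
+//     static atomic_flag once = ATOMIC_FLAG_INIT;
+//     if (atomic_flag_test_and_set(&once)) return;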
+void ggml_mulmat_init_task_profiles(void) { + const size_t sz = sizeof(struct ggml_task_profile_factory); + memset(&default_task_profile_factory, 0, sz); -typedef int ggml_lock_t; + // f32 + { + struct ggml_task_profile *p = default_task_profile_factory.f32_f32; + int i = 0; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -typedef pthread_t ggml_thread_t; - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -typedef pthread_t ggml_thread_t; - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; #endif -struct ggml_compute_state_shared { - ggml_lock_t spin; +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_f32_f32 = i; + } - int n_threads; + // f16 + { + struct ggml_task_profile *p = default_task_profile_factory.f16_f32; + int i = 0; - // synchronization primitives - atomic_int n_ready; - atomic_bool has_work; - atomic_bool stop; // stop all threads -}; + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; -struct ggml_compute_state { - ggml_thread_t thrd; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif - struct ggml_compute_params params; - struct ggml_tensor * node; +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_f16_f32 = i; + } - struct ggml_compute_state_shared * shared; -}; + // qxx + { + struct ggml_task_profile *p = default_task_profile_factory.qxx_f32; + int i = 0; -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; - const int n_threads = state->shared->n_threads; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[0].parallel = true; + p[i].stages[1].backend = 
GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif - while (true) { - if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { - atomic_store(&state->shared->has_work, false); - } else { - while (atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_qxx_f32 = i; + } +} + +int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf, + enum ggml_type src0_t, enum ggml_type src1_t, + struct ggml_task_profile **profiles) { + GGML_ASSERT(profiles); + + if (pf == NULL) { + pf = &default_task_profile_factory; + } + + GGML_ASSERT(src1_t == GGML_TYPE_F32); + + if (src0_t == GGML_TYPE_F32) { + *profiles = pf->f32_f32; + return pf->n_f32_f32; + } + + if (src0_t == GGML_TYPE_F16) { + *profiles = pf->f16_f32; + return pf->n_f16_f32; + } + + if (ggml_is_quantized(src0_t)) { + *profiles = pf->qxx_f32; + return pf->n_qxx_f32; + } + + GGML_ASSERT(false); +} + +static const struct ggml_task_profile * +ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf, + enum ggml_type src0_type, + enum ggml_type src1_type) { + GGML_ASSERT(src1_type == GGML_TYPE_F32); + if (pf == NULL) { + pf = &default_task_profile_factory; + } + + struct ggml_task_profile *p = NULL; + + if (src0_type == GGML_TYPE_F32) { + p = &pf->f32_f32[0]; + } else if (src0_type == GGML_TYPE_F16) { + p = &pf->f16_f32[0]; + } else if (ggml_is_quantized(src0_type)) { + p = &pf->qxx_f32[0]; + } else { + GGML_ASSERT(false); + } + + for (int i = 0; i < 3; i++) { + GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU || + p->stages[i].backend == GGML_TASK_BACKEND_NONE); + } + + return p; +} + +// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. +static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, + struct ggml_mulmat_tune *tune) { + GGML_ASSERT(node); + GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); + + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + + // Type and memory layout requirements for computing mul_mat with BLAS. + bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); + + int M = (int)node->ne[1]; + int N = (int)node->ne[0]; + int K = (int)node->src1->ne[0]; + + struct ggml_task_profile *profiles = NULL; + int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles); + GGML_ASSERT(n_profiles >= 2); + GGML_ASSERT(profiles); + + const struct ggml_task_profile *prof = NULL; + + if (cond_match) { +#if defined(GGML_USE_MULMAT_TUNE) + if (tune != NULL) { + int stages_time_us[3]; + prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); + if (prof != NULL) { + GGML_ASSERT(prof); + memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); + // Do not wait if the estimated execution time is too small (e.g. 
less than 0.1 ms) + // TODO: need bench actual wait/notify time, see ggml-threading.c + for (int i = 0; i < 3; i++) { + if (node->task_profile.stages[i].wait) { + if (stages_time_us[i] < 100) { + node->task_profile.stages[i].wait = false; + } + } + } + return; + } + } +#else + UNUSED(tune); +#endif + + if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { + for (int j = 0; j < n_profiles; j++) { + enum ggml_task_backend comp_be = + profiles[j].stages[GGML_TASK_COMPUTE].backend; + + switch (comp_be) { + case GGML_TASK_BACKEND_GPU_CUDA: { + GGML_ASSERT(ggml_cpu_has_cublas()); + prof = &profiles[j]; + break; + } + case GGML_TASK_BACKEND_GPU_CL: { + GGML_ASSERT(ggml_cpu_has_clblast()); + prof = &profiles[j]; + break; + } + case GGML_TASK_BACKEND_CPU_BLAS: { + GGML_ASSERT(ggml_cpu_has_cpublas()); + prof = &profiles[j]; + break; + } + default: { + break; + } } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); } } - - atomic_fetch_sub(&state->shared->n_ready, 1); - - // wait for work - while (!atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); - } - - // check if we should stop - if (atomic_load(&state->shared->stop)) { - break; - } - - if (state->node) { - if (state->params.ith < state->params.nth) { - ggml_compute_forward(&state->params, state->node); - } - - state->node = NULL; - } else { - break; - } } - return 0; + if (prof == NULL) { + prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t); + } + + GGML_ASSERT(prof); + memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; + int n_threads = cgraph->n_threads; - struct ggml_compute_state_shared state_shared = { - /*.spin =*/ GGML_LOCK_INITIALIZER, - /*.n_threads =*/ n_threads, - /*.n_ready =*/ 0, - /*.has_work =*/ false, - /*.stop =*/ false, - }; - struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; + if (ggml_cpu_has_blas()) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor *node = cgraph->nodes[i]; - // create thread pool - if (n_threads > 1) { - ggml_lock_init(&state_shared.spin); + memset(&node->task_profile, 0, sizeof(struct ggml_task_profile)); + struct ggml_task_stage *stages = node->task_profile.stages; - atomic_store(&state_shared.has_work, true); + // Adapt node->backend: assume GPU at COMPUTE stage. + if (node->backend > GGML_BACKEND_CPU) { + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE; + stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE; - for (int j = 0; j < n_threads - 1; j++) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .params = { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = n_threads, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }, - .node = NULL, - .shared = &state_shared, - }; - - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); + stages[GGML_TASK_COMPUTE].parallel = false; + bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL); + stages[GGML_TASK_COMPUTE].wait = wait; + if (ggml_cpu_has_cublas()) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + } else if (node->op == GGML_OP_MUL_MAT) { + struct ggml_mulmat_tune * tune = NULL; +#if defined(GGML_USE_MULMAT_TUNE) + tune = cgraph->tune; +#endif + ggml_mulmat_set_tensor_task_profile(node, tune); + } else if (node->op == GGML_OP_OUT_PROD) { + ggml_mulmat_set_tensor_task_profile(node, NULL); + } } } + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + // initialize tasks + work buffer { size_t work_size = 0; @@ -15709,13 +15813,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + struct ggml_task_stage *stages = node->task_profile.stages; switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - node->n_tasks = n_threads; - + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; size_t cur = 0; if (ggml_is_quantized(node->type)) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; @@ -15726,7 +15830,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ADD: case GGML_OP_ADD1: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15738,7 +15843,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_ACC: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15764,9 +15871,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_STEP: case GGML_OP_RELU: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_MUL: + { + if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; + } + } break; case GGML_OP_GELU: case GGML_OP_SILU: case GGML_OP_SILU_BACK: @@ -15774,66 +15887,65 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, 
node->n_tasks); - size_t cur = 0; + enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; + GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); -#if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + // TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct. + if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); + GGML_ASSERT(cond_match); } - else -#elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { +#if defined(GGML_USE_CLBLAST) cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else +#else + GGML_ASSERT(false); #endif - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + GGML_ASSERT(ggml_cpu_has_cpublas()); + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { // here we need memory just for single 2D matrix from src0 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { - cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - } -#endif - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; + } else if (ggml_is_quantized(node->src0->type)) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else -#endif - { + } else { + GGML_ASSERT(false); + } + } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { + // We have to reseve buffer for CUDA because it may fallback to CPU. 
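+                        // (On GGML_COMPUTE_FALLBACK, ggml_graph_compute() retries the
+                        // node with the default CPU task profile, so the CPU-sized work
+                        // buffer must be reserved up front.)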
+ if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { + GGML_ASSERT(ggml_cpu_has_cublas()); + } + + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + } else if (ggml_is_quantized(node->src0->type)) { const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + } else { + GGML_ASSERT(false); } } else { GGML_ASSERT(false); @@ -15843,9 +15955,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_SET: + { + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -15856,7 +15973,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -15864,20 +15981,23 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_ALIBI: { - node->n_tasks = 1; //TODO + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CLAMP: { - node->n_tasks = 1; //TODO + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); @@ -15906,45 +16026,48 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_ATTN: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - node->n_tasks = n_threads; - + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = 
sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_ATTN_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15952,13 +16075,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -15966,32 +16089,38 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; + stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU; - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_COUNT: { GGML_ASSERT(false); } break; + default: + GGML_ASSERT(false); } } @@ -16023,126 +16152,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_node_start_cycles = ggml_perf_cycles(); const int64_t perf_node_start_time_us = 
ggml_perf_time_us(); - // INIT - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, - /*.ith =*/ 0, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, - }; - - ggml_compute_forward(¶ms, node); - - // COMPUTE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); + // TODO: can be moved out of loop? + void *wdata = NULL; + size_t wsize = 0; + if (cgraph->work) { + wdata = cgraph->work->data; + wsize = ggml_nbytes(cgraph->work); } - params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - } - - // FINALIZE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_FINALIZE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); - } - - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + enum ggml_compute_error err = + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + if (err == GGML_COMPUTE_FALLBACK) { + if (node->op == GGML_OP_MUL_MAT) { + const struct ggml_task_profile *p = + ggml_mulmat_get_default_task_profile( + NULL, node->src0->type, node->src1->type); + memcpy(&node->task_profile, p, + sizeof(struct ggml_task_profile)); } + err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); } + GGML_ASSERT(err == GGML_COMPUTE_OK); // performance stats (node) { @@ -16155,19 +16185,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } - // join thread pool - if (n_threads > 1) { - atomic_store(&state_shared.stop, true); - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - ggml_lock_destroy(&state_shared.spin); - } + ggml_threading_stop(thrd_ctx); // performance stats (graph) { @@ -16242,7 +16260,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, + tensor->task_profile.stages[0].parallel, // replaceed n_tasks. 
tensor->data, tensor->name); } @@ -18024,14 +18042,6 @@ int ggml_cpu_has_wasm_simd(void) { #endif } -int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - return 1; -#else - return 0; -#endif -} - int ggml_cpu_has_cublas(void) { #if defined(GGML_USE_CUBLAS) return 1; @@ -18048,10 +18058,22 @@ int ggml_cpu_has_clblast(void) { #endif } +int ggml_cpu_has_cpublas(void) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_gpublas(void) { return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); } +int ggml_cpu_has_blas(void) { + return ggml_cpu_has_cpublas() || ggml_cpu_has_gpublas(); +} + int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; diff --git a/ggml.h b/ggml.h index 1380c530f..f51b658fd 100644 --- a/ggml.h +++ b/ggml.h @@ -1,5 +1,7 @@ #pragma once + + // // GGML Tensor Library // @@ -200,6 +202,7 @@ #define GGML_MAX_OPT 4 #define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 +#define GGML_MAX_TASK_PROFILES 8 #define GGML_ASSERT(x) \ do { \ @@ -347,7 +350,6 @@ extern "C" { GGML_OP_COUNT, }; - // ggml object struct ggml_object { size_t offs; @@ -360,6 +362,54 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // As part of task config profile solution, `ggml_task_backend` defines + // backends for each task stage. Similar to `ggml_tensor.backend`, + // `ggml_tensor.task_profile` generalizes how to configure tensor computing + // at per task-stage level. + // + // The following enum values are designed as combination of hardware and + // optional software interface. + enum ggml_task_backend { + GGML_TASK_BACKEND_NONE = 0, + + // [0x10, 0x1F]: CPU + GGML_TASK_BACKEND_CPU = 0x10, + GGML_TASK_BACKEND_CPU_BLAS = 0x11, + + // [0x20 - 0x2F]: GPU + GGML_TASK_BACKEND_GPU = 0x20, + GGML_TASK_BACKEND_GPU_CUDA = 0x21, + GGML_TASK_BACKEND_GPU_CL = 0x22, + }; + + // config for computing one of the 3 task stages of a tensor. + struct ggml_task_stage { + enum ggml_task_backend backend; + bool parallel; + // hint idle workers go waiting, valid only when parallel is false. + bool wait; + }; + + // config for computing a tensor. + struct ggml_task_profile { + // index 0: INIT, 1: COMPUTE, 2: FINALIZE + struct ggml_task_stage stages[3]; + + // MUST be used only in testing codes. + uint8_t dev_flags[4]; + }; + + struct ggml_task_profile_factory { + struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES]; + int n_f32_f32; + + struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES]; + int n_f16_f32; + + struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES]; + int n_qxx_f32; + }; + // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -383,7 +433,8 @@ extern "C" { struct ggml_tensor * opt[GGML_MAX_OPT]; // thread scheduling - int n_tasks; + + struct ggml_task_profile task_profile; // performance int perf_runs; @@ -396,7 +447,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[4]; + char padding[12]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -407,6 +458,8 @@ extern "C" { int n_leafs; int n_threads; + struct ggml_mulmat_tune *tune; + size_t work_size; struct ggml_tensor * work; @@ -1287,9 +1340,21 @@ extern "C" { GGML_API int ggml_cpu_has_cublas (void); GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_gpublas (void); + GGML_API int ggml_cpu_has_cpublas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_vsx (void); + // + // mulmat task profiles + // + GGML_API void ggml_mulmat_init_task_profiles(void); + + GGML_API int ggml_mulmat_get_task_profiles( + struct ggml_task_profile_factory *pf, + enum ggml_type src0_t, + enum ggml_type src1_t, + struct ggml_task_profile **profiles); + // // Internal types and functions exposed for tests and benchmarks // diff --git a/llama.cpp b/llama.cpp index c165d3239..fa5a94e21 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #endif #include "llama-util.h" @@ -20,6 +21,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_MULMAT_TUNE +#include "ggml-tune.h" +#endif + #include #include #include @@ -280,6 +285,10 @@ struct llama_context { int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; +#ifdef GGML_USE_MULMAT_TUNE + struct ggml_mulmat_tune *tune = nullptr; +#endif + void use_buf(struct ggml_context * ctx, int i) { #if defined(LLAMA_USE_SCRATCH) size_t last_size = 0; @@ -1396,10 +1405,12 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; + gf.n_threads = n_threads; + +#ifdef GGML_USE_MULMAT_TUNE + gf.tune = lctx.tune; +#endif struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); ggml_set_name(embd, "embd"); @@ -2732,7 +2743,150 @@ struct llama_context * llama_init_from_file( return ctx; } +#ifdef GGML_USE_MULMAT_TUNE +bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { + printf("\n"); + if (ctx->model.n_gpu_layers != 0) { + fprintf(stderr, "[mulmat tune] error: tuning is disabled when GPU offloading is enabled\n"); + return false; + } + + const char *model_name = llama_model_type_name(ctx->model.type); + + llama_hparams *hparams = &ctx->model.hparams; + + enum ggml_ftype ggml_ftype; + switch (hparams->ftype) { + case LLAMA_FTYPE_ALL_F32: + ggml_ftype = GGML_FTYPE_ALL_F32; + break; + case LLAMA_FTYPE_MOSTLY_F16: + ggml_ftype = GGML_FTYPE_MOSTLY_F16; + break; + case LLAMA_FTYPE_MOSTLY_Q4_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_0; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16; + break; + case LLAMA_FTYPE_MOSTLY_Q5_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0; + break; + case LLAMA_FTYPE_MOSTLY_Q5_1: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_1; + break; + case LLAMA_FTYPE_MOSTLY_Q8_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q8_0; + break; + case LLAMA_FTYPE_MOSTLY_Q2_K: + ggml_ftype = GGML_FTYPE_MOSTLY_Q2_K; + break; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: + case LLAMA_FTYPE_MOSTLY_Q3_K_M: + case LLAMA_FTYPE_MOSTLY_Q3_K_L: + ggml_ftype = GGML_FTYPE_MOSTLY_Q3_K; + break; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: + case LLAMA_FTYPE_MOSTLY_Q4_K_M: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_K; + break; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: + case LLAMA_FTYPE_MOSTLY_Q5_K_M: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_K; + break; + case LLAMA_FTYPE_MOSTLY_Q6_K: + ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K; + break; + default: + throw std::runtime_error( + format("invalid model file type %d\n", hparams->ftype)); + } + + int n_vocab = hparams->n_vocab; + int n_embd = hparams->n_embd; + int n_rot = hparams->n_rot; + + int n_mult = hparams->n_mult; + int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; + + struct ggml_mulmat_tune_params params = { + /*.model =*/ { + /* .name =*/ model_name, + /* .ftype =*/ ggml_ftype, + /* .n_vocab =*/ n_vocab, + /* .n_embd =*/ n_embd, + /* .n_ff =*/ n_ff, + /* .n_rot =*/ n_rot, + }, + /* .m_num =*/ 8, + /* .n_pass =*/ 1, + /* .n_threads =*/ n_threads, + /* .progress =*/ true, + /* .output_console =*/ false, + /* .fname =*/ fname, + }; + + bool empty_fname = !fname || strcmp(fname, "") == 0; + + ctx->tune = new(struct ggml_mulmat_tune); + if (!ctx->tune) { + throw std::runtime_error(format("failed to allocate memory for tune\n")); + } + + if (tune) { + bool ok = ggml_mulmat_tune_bench(ctx->tune, &params); + if (!ok) { + ggml_mulmat_tune_free(ctx->tune); + return false; + } + if (!empty_fname) { + ggml_mulmat_tune_free(ctx->tune); + return true; + } + } else { + if (empty_fname) { + return false; + } + } + + if (!empty_fname) { + FILE *fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, "[mulmat tune] failed to open file %s.\n", + fname); + } else { + bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); + fclose(fp); + + if (!ok) { + fprintf(stderr, + "[mulmat tune] failed to read data from %s\n", + fname); + return false; + } + + fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname); + + ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, 
params.n_threads); + if (!ok) { + return false; + } + } + } + + return true; +} +#endif + void llama_free(struct llama_context * ctx) { +#ifdef GGML_USE_MULMAT_TUNE + if (ctx->tune) { + delete(ctx->tune); + } +#endif delete ctx; } diff --git a/llama.h b/llama.h index 1241ba6c0..c3f6a2154 100644 --- a/llama.h +++ b/llama.h @@ -300,6 +300,9 @@ extern "C" { // Print system information LLAMA_API const char * llama_print_system_info(void); + // Experimental utility functionality for mulmat tuning. + LLAMA_API bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname); + #ifdef __cplusplus } #endif diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 000000000..f4b8ee1b3 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,2 @@ +/test-ggml-threading +/test-ggml-tune diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c..977b8ef6d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,3 +12,5 @@ llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) # llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW +llama_add_test(test-ggml-threading.c) +llama_add_test(test-ggml-tune.c) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c new file mode 100644 index 000000000..0b47623e2 --- /dev/null +++ b/tests/test-ggml-threading.c @@ -0,0 +1,345 @@ +#include "ggml-threading.h" +#include "ggml.h" + +#include +#include +#include +#include + +// Purposes: +// 1. general overview of the threading behaviors. +// 2. race (deadlock) detection. + +// # build +// cd build +// +// # build release: +// cmake .. && cmake --build . --config Release +// +// # build with sanitize: +// cmake .. -DLLAMA_SANITIZE_THREAD=ON && cmake --build . --config Release +// +// # run: +// ./bin/test-ggml-threading + +// How to turn off the warning on Apple: malloc: nano zone abandoned due to +// inability to reserve vm space? +// ==> export MallocNanoZone=0, no need to rebuild. +// See `nano_init()` from +// https://opensource.apple.com/source/libmalloc/libmalloc-140.40.1/src/nano_malloc.c.auto.html

// How to view the threading debug: +// ==> uncomment `#define GGML_THREADING_DEBUG 1` from file ggml-threading.c + +#define UNUSED(x) (void)(x) + +#define MAX_N_THREADS 16 + +static const int n_repeat = 10; + +// Atomics are awkward to use with C11 on Windows, so the atomic counter is +// replaced with a plain per-thread array. 
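A minimal usage sketch for the llama_mulmat_tune() API declared in the llama.h hunk above, assuming a build with LLAMA_MULMAT_TUNE enabled. The wrapper function, its name, and the error handling below are illustrative assumptions and are not part of this patch; only the llama_mulmat_tune() signature comes from it.

#include <stdbool.h>
#include <stdio.h>
#include "llama.h"

// Hypothetical helper: benchmark (or load) mulmat tune data for a loaded context.
static int run_mulmat_tune(struct llama_context * ctx, int n_threads,
                           bool bench, const char * fname) {
    // bench == true: run the benchmark now (with a non-empty fname the tuner
    // can persist the result); bench == false: load previously saved data
    // from fname, per the llama.cpp hunk above.
    if (!llama_mulmat_tune(ctx, n_threads, bench, fname)) {
        fprintf(stderr, "[mulmat tune] setup failed\n");
        return 1;
    }
    return 0;
}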
+static int work_done_arr[MAX_N_THREADS]; + +static enum ggml_compute_error +mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { + int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; + if (node->task_profile.stages[params->type].parallel) { + loops /= params->nth; + } + + volatile int64_t j = 0; + for (int i = 0; i < loops; i++) { + j++; + } + + UNUSED(j); + + work_done_arr[params->ith]++; + return GGML_COMPUTE_OK; +} + +int test_driver(int id, struct ggml_tensor *node, int n_threads) { + printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads); + + for (int i = 0; i < n_threads; i++) { + work_done_arr[i] = 0; + } + + bool wait_on_done = (node->task_profile.dev_flags[0] > 0u); + + enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF; + if (wait_on_done) { + features |= GGML_THREADING_FEATURE_WAIT_ON_DONE; + } + + int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = + ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, features, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + for (int i = 0; i < n_repeat; i++) { + enum ggml_compute_error err = ggml_threading_compute_tensor( + ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + if (err != GGML_COMPUTE_OK) { + ggml_threading_stop(ctx); + fprintf(stderr, + "ggml_threading_compute_tensor failed with error: %d.\n", + err); + return 1; + } + } + + int t2 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int t3 = (int)ggml_time_us(); + + int expect = 0; + for (int i = 0; i < 3; i++) { + struct ggml_task_stage *ts = &node->task_profile.stages[i]; + if (ts->backend != GGML_TASK_BACKEND_NONE) { + if (ts->parallel) { + expect += n_threads; + } else { + expect++; + } + } + } + expect *= n_repeat; + + int actual = 0; + for (int i = 0; i < n_threads; i++) { + actual += work_done_arr[i]; + } + + uint8_t loops = node->task_profile.dev_flags[1]; + + printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: " + "(parallel: %d, " + "wait: %d)\n" + "\tstage-1: (parallel: %d, wait: %d)\n", + loops, wait_on_done, node->task_profile.stages[0].parallel, + node->task_profile.stages[0].wait, + node->task_profile.stages[1].parallel, + node->task_profile.stages[1].wait); + + if (actual == expect) { + printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f " + "ms, total %6.3f ms\n", + 1.0 * (t1 - t0) / 1000, 1.0 * (t2 - t1) / 1000, + 1.0 * (t3 - t2) / 1000, 1.0 * (t3 - t0) / 1000); + return 0; + } + + fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect, + actual); + + return 2; +} + +static enum ggml_compute_error +mock_task_runner_fallback(struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + if (node->backend == GGML_BACKEND_GPU) { + // ... finally failed to compute on the GPU. + + node->backend = GGML_BACKEND_CPU; + return GGML_COMPUTE_FALLBACK; + } else { + return GGML_COMPUTE_OK; + } +} + +// By design, fallback should happen when attempting to compute a tensor on +// the GPU, thus it is not parallelized. 
+int test_fallback(struct ggml_tensor *node) { + struct ggml_threading_context *ctx = ggml_threading_start( + 1, ggml_threading_graph_compute_thread, mock_task_runner_fallback, + /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); + + enum ggml_compute_error err = + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + if (err == GGML_COMPUTE_FALLBACK) { + err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, + /*wsize*/ 0); + } + + ggml_threading_stop(ctx); + if (err != GGML_COMPUTE_OK) { + fprintf(stderr, + "ggml_threading_compute_tensor failed with error: %d.\n", err); + return 1; + } + + return 0; +} + +int main(void) { + ggml_time_init(); + + struct ggml_tensor node; + memset(&node, 0, sizeof(struct ggml_tensor)); + + struct ggml_task_stage *stages = node.task_profile.stages; + + stages[0].backend = GGML_TASK_BACKEND_CPU; + stages[1].backend = GGML_TASK_BACKEND_CPU; + stages[2].backend = GGML_TASK_BACKEND_NONE; + + int n_passed = 0; + int n_tests = 0; + + int parallel[3] = {0, 1, 2}; + + // In GitHub build actions (windows-latest-cmake and ubuntu-latest-cmake): + // when n_threads >= 4, the thread init time and compute time suddenly become + // 100x ~ 1000x slower compared to n_threads == 2. + // + // But the tests (n_threads 1, 2, 4, 6) look sound on my devices: + // - MacBook Air 2013, Ubuntu 22.04 + // - MacBook Pro 2018, macOS 13.4 + // + // So I assume the GitHub build host has a limited multi-CPU quota. + // Computing is skipped when the threading init time is too slow. + // + // NOTE: it's observed that when the workload is 0 and n_threads >= the + // number of physical cores: + // - the wait/wakeup time varies a lot: it can be tens or hundreds of times + // the average, thus greatly punishing small workloads. + // - wait_on_done is generally faster than wait_now, and can be 10x faster. + + int threads_arr[] = {1, 2, 4, 8}; + int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); + + // millions of loops. + uint8_t workload_arr[] = {0u, 1u, 10u}; + int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]); + + // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops. + + for (int x = 0; x < workload_arr_len; x++) { + node.task_profile.dev_flags[1] = workload_arr[x]; + + for (int i = 0; i < threads_arr_len; i++) { + int n_threads = threads_arr[i]; + if (n_threads > MAX_N_THREADS) { + abort(); + } + + printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, " + "loops: %2d million(s) ====\n", + n_repeat, n_threads, workload_arr[x]); + + if (n_threads > 1) { // skip this n_threads when too slow. 
+ int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, 0, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int elapsed_us = t1 - t0; + if (elapsed_us > 500 * n_threads) { + fprintf(stderr, + "[test-ggml-threading] warning: it took %.3f " + "ms to start %d worker thread(s).\n", + 1.0 * elapsed_us / 1000, n_threads - 1); + fprintf(stderr, "[test-ggml-threading] warning: looks like " + "the environment is too slow to run this " + "number of threads, skip.\n"); + continue; + } + } + + // multi-threads: parallel + wait_now/wait_on_done + + if (n_threads == 1) { + stages[0].parallel = false; + stages[1].parallel = false; + stages[0].wait = false; + stages[1].wait = false; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + continue; + } + + for (int j = 0; j < 3; j++) { + stages[0].wait = false; + stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; + + if (parallel[j] == 0) { + stages[0].parallel = false; + stages[1].parallel = false; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } else if (parallel[j] == 1) { + stages[0].parallel = true; + stages[1].parallel = false; + + for (int k = 0; k < 2; k++) { + stages[1].wait = (k == 1); + + if (!stages[1].wait) { + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + continue; + } + + // wait + + for (int m = 0; m < 2; m++) { + if (m == 1) { + node.task_profile.dev_flags[0] = 1u; + } + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + node.task_profile.dev_flags[0] = 0u; + } + } + } else { + stages[0].parallel = true; + stages[1].parallel = true; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } + } + } + } + + { + ++n_tests; + + node.backend = GGML_BACKEND_GPU; + if (test_fallback(&node) == 0) { + ++n_passed; + printf("\n[test-ggml-threading] test fallback: ok\n\n"); + } + } + + printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests); + + return (n_passed == n_tests) ? 
0 : 1; +} diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c new file mode 100644 index 000000000..ed612fff4 --- /dev/null +++ b/tests/test-ggml-tune.c @@ -0,0 +1,200 @@ +#include "ggml-tune.h" +#include "ggml.h" + +#include + +static int bench(void); +static int estimate_time_non_zero_NK(void); + +static void init_params(struct ggml_mulmat_tune_params *params, int m_num) { + *params = (struct ggml_mulmat_tune_params){ + .model = + (struct ggml_mulmat_tune_model){ + .name = "3B", // fake + .ftype = GGML_FTYPE_MOSTLY_Q4_0, + .n_vocab = 4096, + .n_embd = 1024, + .n_ff = 2048, + .n_rot = 128, + }, + .m_num = m_num, + .n_pass = 1, + .n_threads = 1, + .progress = false, + .output_console = true, + .fname = NULL}; +} + +int main(void) { + int rv = bench(); + if (rv != 0) { + return rv; + } + + printf("\n"); + + rv = estimate_time_non_zero_NK(); + if (rv != 0) { + return rv; + } + printf("\n"); + + return 0; +} + +static int bench(void) { + printf("test: %s\n", __func__); + + { + enum ggml_task_backend backends[16]; + int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); + if (n_backends < 2) { + printf("test: %s, skipped because no BLAS\n", __func__); + return 0; + } + } + + { + struct ggml_init_params init_params = { + /*.mem_size =*/1, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/0, + }; + struct ggml_context *ctx = ggml_init(init_params); + GGML_ASSERT(ctx); + ggml_free(ctx); + } + + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + + init_params(&params, /*m_num*/ 4); + + bool ok = ggml_mulmat_tune_bench(&tune, &params); + ggml_mulmat_tune_free(&tune); + + return ok ? 0 : 1; +} + +int estimate_time_non_zero_NK(void) { + printf("test: %s\n", __func__); + + struct test_data_t { + int M; + int time[3]; // 3 profiles. 
+ }; + + struct ggml_mulmat_tune tune = { + .version = 1, + .ftype = GGML_FTYPE_MOSTLY_Q4_0, + }; + + const int m_num = 2; + + struct ggml_task_profile_factory pf; + memset(&pf, 0, sizeof(struct ggml_task_profile_factory)); + + { + pf.n_qxx_f32 = 2; + pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + + pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU; + pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + } + + struct ggml_mulmat_tune_params params; + init_params(&params, m_num); + + ggml_mulmat_tune_init(&tune, &params, &pf); + + struct ggml_mulmat_tune_shape *shape = NULL; + for (int i = 0; i < tune.n_shapes; i++) { + if (tune.shapes[i].N > 0 && tune.shapes[i].K > 0) { + shape = &tune.shapes[i]; + break; + } + } + GGML_ASSERT(shape); + GGML_ASSERT(shape->n_profiles == 2); + GGML_ASSERT(ggml_is_quantized(shape->src0_type)); + + printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K, + shape->n_profiles); + + { + shape->items[0] = + (struct ggml_mulmat_tune_m){.M = 2, .stages_time = {2, 4, 0}}; + shape->items[1] = + (struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 8, 0}}; + + shape->items[2] = + (struct ggml_mulmat_tune_m){.M = 2, .stages_time = {4, 4, 0}}; + shape->items[3] = + (struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 4, 0}}; + } + + const struct test_data_t test_data[] = { + { + .M = 1, // out of range + .time = {3, 8}, + }, + { + .M = 2, + .time = {6, 8}, + }, + { + .M = 3, + .time = {9, 8}, + }, + { + .M = 4, + .time = {12, 8}, + }, + { + .M = 5, // out of range + .time = {15, 8}, + }, + }; + + int n_tests = (int)(sizeof(test_data) / sizeof(struct test_data_t)); + + struct ggml_mulmat_tune_time profile_time[GGML_MAX_TASK_PROFILES]; + size_t profile_time_sz = + sizeof(struct ggml_mulmat_tune_time) * GGML_MAX_TASK_PROFILES; + + int n_passed = 0; + for (int i = 0; i < n_tests; i++) { + memset(profile_time, 0, profile_time_sz); + const struct test_data_t *e = &test_data[i]; + + const struct ggml_mulmat_tune_shape *matched_shape = + ggml_mulmat_tune_get_shape(&tune, shape->N, shape->K, + shape->src0_type, shape->src1_type); + GGML_ASSERT(matched_shape); + GGML_ASSERT(matched_shape == shape); + + ggml_mulmat_tune_estimate_time(matched_shape, e->M, profile_time); + + for (int j = 0; j < shape->n_profiles; j++) { + int actual = profile_time[j].total_time; + int expect = e->time[j]; + if (expect != actual) { + fprintf(stderr, + "test fail. i: %d, j: %d, M: %d, expect: " + "%d, actual: %d\n", + i, j, e->M, expect, actual); + } else { + ++n_passed; + } + } + } + + n_tests *= shape->n_profiles; + printf("%2d of %2d passed\n", n_passed, n_tests); + + ggml_mulmat_tune_free(&tune); + + return n_passed == n_tests ? 0 : 1; +}
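For reference, a minimal sketch of how the task-profile structures added to ggml.h can be populated, mirroring the qxx_f32 profiles set up in test-ggml-tune.c above (parallel CPU INIT stage, single-threaded CPU_BLAS COMPUTE stage with waiting workers). The helper name and the exact stage/flag choices are illustrative assumptions, not part of this patch.

#include <string.h>
#include "ggml.h"

// Hypothetical helper: build a profile where INIT runs in parallel on the
// CPU and COMPUTE runs single-threaded through BLAS while idle workers wait.
static struct ggml_task_profile make_cpu_blas_profile(void) {
    struct ggml_task_profile p;
    memset(&p, 0, sizeof(p));

    // stage 0: INIT -- split across workers on the CPU.
    p.stages[0].backend  = GGML_TASK_BACKEND_CPU;
    p.stages[0].parallel = true;

    // stage 1: COMPUTE -- one thread calls into BLAS, others are hinted to wait.
    p.stages[1].backend  = GGML_TASK_BACKEND_CPU_BLAS;
    p.stages[1].parallel = false;
    p.stages[1].wait     = true;

    // stage 2: FINALIZE -- unused for this profile.
    p.stages[2].backend  = GGML_TASK_BACKEND_NONE;
    return p;
}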