mqy 2023-06-14 18:33:14 +08:00
parent ce2c7d72e2
commit 213f133701
25 changed files with 3664 additions and 518 deletions

.gitignore vendored

@@ -40,6 +40,7 @@ models/*
/server
/Pipfile
/libllama.so
/mulmat-tune
build-info.h
arm_neon.h

CMakeLists.txt

@@ -78,6 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF)
#
# Build info header
@@ -214,6 +215,7 @@ if (LLAMA_BLAS)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
add_compile_definitions(GGML_BLAS_VENDOR="${LLAMA_BLAS_VENDOR}")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
@@ -276,6 +278,11 @@ if (LLAMA_METAL)
)
endif()
if (LLAMA_MULMAT_TUNE)
add_compile_definitions(GGML_USE_MULMAT_TUNE)
add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG)
endif()
if (LLAMA_K_QUANTS)
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
add_compile_definitions(GGML_USE_K_QUANTS)
@@ -450,6 +457,8 @@ endif()
add_library(ggml OBJECT
ggml.c
ggml-threading.c
ggml-tune.c
ggml.h
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}

Makefile

@@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple mulmat-tune
ifdef LLAMA_BUILD_SERVER
BUILD_TARGETS += server
@@ -47,7 +47,8 @@ endif
OPT = -O3
CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
# -lm fixed error: ggml.o: undefined reference to symbol 'tanhf@@GLIBC_2.2.5' from ubuntu 22.04
LDFLAGS = -lm
ifdef LLAMA_DEBUG
CFLAGS += -O0 -g
@@ -134,8 +135,7 @@ ifndef LLAMA_NO_K_QUANTS
endif
ifndef LLAMA_NO_ACCELERATE
# Mac Intel & M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
@@ -145,10 +145,16 @@ endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
ifeq ($(UNAME_S),Darwin)
# openblas installed with Homebrew on macOS.
CFLAGS += -I/usr/local/opt/openblas/include
LDFLAGS += -L/usr/local/opt/openblas/lib
endif
endif # LLAMA_OPENBLAS
ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
CFLAGS += -DGGML_BLAS_VENDOR="\"BLIS\""
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
@@ -230,6 +236,11 @@ k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
ifdef LLAMA_MULMAT_TUNE
CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG
CXXFLAGS += -DGGML_USE_MULMAT_TUNE
endif
#
# Print build information
#
@@ -245,6 +256,8 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
OBJS += ggml-tune.o ggml-threading.o
#
# Build library
#
@@ -253,7 +266,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-threading.o: ggml-threading.c ggml.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-tune.o: ggml-tune.c ggml.h
$(CC) $(CFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -298,6 +316,9 @@ server: examples/server/server.cpp examples/server/httplib.h examples/server/jso
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
mulmat-tune: examples/mulmat-tune/mulmat-tune.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o mulmat-tune $(LDFLAGS)
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \

examples/CMakeLists.txt

@@ -39,6 +39,7 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(simple)
add_subdirectory(mulmat-tune)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()

examples/common.cpp

@@ -345,6 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.mem_test = true;
} else if (arg == "--export") {
params.export_cgraph = true;
#ifdef GGML_USE_MULMAT_TUNE
} else if (arg == "--tune") {
params.tune = true;
} else if (arg == "--tune-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.tune_file = argv[i];
#endif // GGML_USE_MULMAT_TUNE
} else if (arg == "--verbose-prompt") { } else if (arg == "--verbose-prompt") {
params.verbose_prompt = true; params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") { } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -498,6 +508,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
#ifdef GGML_USE_MULMAT_TUNE
fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n");
fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
#endif
fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");

examples/common.h

@@ -77,6 +77,8 @@ struct gpt_params {
bool mem_test = false; // compute maximum memory usage
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
bool tune = false; // mulmat tune: enable
std::string tune_file = ""; // mulmat tune: data file
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

examples/main/main.cpp

@@ -117,6 +117,16 @@ int main(int argc, char ** argv) {
return 1;
}
#ifdef GGML_USE_MULMAT_TUNE
if (params.tune || !params.tune_file.empty()) {
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) {
llama_free(ctx);
return ok? 0: 1;
}
}
#endif
// print system information
{
fprintf(stderr, "\n");

examples/mulmat-tune/CMakeLists.txt

@@ -0,0 +1,14 @@
set(TARGET mulmat-tune)
add_executable(${TARGET} mulmat-tune.cpp)
if (XCODE OR MSVC)
set(MULMAT_TUNE_LIBS ggml)
else()
set(MULMAT_TUNE_LIBS ggml m)
endif()
target_link_libraries(${TARGET} PRIVATE ${MULMAT_TUNE_LIBS} ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

examples/mulmat-tune/README.md

@@ -0,0 +1,272 @@
# Mulmat Benchmark and Tuning
Apart from the standalone tool `mulmat-tune`, mulmat tune is also integrated into
`main` and `perplexity`. To avoid adding too many new CLI options, I just added two.
To make it run faster, `m_num` is set to 8 (thus the max M is 128) and `n_pass`
is set to 1.
With the newly added CLI options, we can use `main` and `perplexity` in the
following three ways:
* bench and run: --tune
* bench and exit: --tune --tune-file <FILE>
* load and run: --tune-file <FILE>
The `load` mode reads an existing data file. This is fine because we can run the
bench ahead of time (saving tens of seconds), but it has two shortcomings:
- we have to re-run the bench whenever the file format changes; this is OK because
  we are made aware of it.
- the more subtle problem is that the algorithm may have changed silently while we
  are still using an outdated file. So I integrated mulmat tune into `main` and
  `perplexity` as a complementary solution.
## Build into main and perplexity
Makefile:
```
make clean && LLAMA_MULMAT_TUNE=1 make
```
CMake (with BLAS):
```
cmake --build . --target clean
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
cmake --build . --config Release
```
Run examples:
```
# bench and run:
./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune
# bench then exit:
./main -m ./models/3B/open-llama-3b-q4-0.bin --tune --tune-file <FILE>
# load and run
./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file <FILE>
```
# Build the standalone `mulmat-tune`
Makefile:
```
make clean && LLAMA_MULMAT_TUNE=1 make
```
CMake (with BLAS)
```
cmake --build . --target clean
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
cmake --build . --config Release
```
Run examples:
```
./mulmat-tune -h
# run with default params (7B, Q4_0, ...)
./mulmat-tune
# set model
./mulmat-tune --model 13B
# set ggml ftype, 2 for Q4_0, 3 for Q4_1, run `mulmat-tune -h` for help.
./mulmat-tune --ftype 3
# customized m_num
./mulmat-tune --m_num 8
# customized n_pass: run 1 pass only instead of the default 3.
./mulmat-tune --n_pass 1
# customized n_threads instead of the default 1.
./mulmat-tune --n_threads 4
# save to file
./mulmat-tune --file <FILE>
# save to file, always override if exists (CAUTION!)
./mulmat-tune --file <FILE> -y
```
# End to End Test
## Compare With Master
You may want to run the following commands. Make sure the tune result file is
set up properly.
General steps:
1. run `./mulmat-tune -h` to see how to build for misc vendors.
   You can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the debug output, e.g.:
```
make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make
```
On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
with `CUDA` or `CL`, you may not see `CUDA` or `CL` in the debug output because `CPU`
or `CPU_BLAS` is faster (according to the estimate from mulmat tune).
2. create a small prompt file:
```
head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
```
3. run any of the following example commands.
```
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
```
* `--mlock` is recommended for `macOS`; you may not want to use it.
* don't change `-c 128`: a context size that is too large causes a 0-perplexity chunk.
* `-t` is the number of threads; `1`, `2`, `4` or `6` are recommended.
* you can change the batch size (`-b`) between `1` and `128`.
* you may want to add other CLI options.
The following results were generated with Accelerate compiled in.
### 1 thread
**Master (2d43387d)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 43.53 | 339.95 |
| 16 | 44.31 | 346.12 |
| 24 | 43.14 | 336.90 |
| 32 | 33.59 | 262.25 |
| 40 | 27.64 | 215.77 |
| 48 | 24.52 | 191.42 |
```
**This branch (tune)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 43.78 | 341.96 |
| 16 | 42.88 | 334.93 |
| 24 | 42.06 | 328.42 |
| 32 | 33.07 | 258.25 |
| 40 | 28.69 | 223.98 |
| 48 | 25.65 | 200.19 |
```
### 4 threads
**Master (2d43387d)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 12.43 | 96.99 |
| 16 | 12.10 | 94.44 |
| 24 | 12.81 | 99.95 |
| 32 | 31.64 | 247.04 |
| 48 | 24.55 | 191.63 |
| 64 | 17.56 | 137.09 |
| 96 | 17.59 | 137.25 |
| 128 | 10.73 | 83.74 |
```
**This branch (no tune)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 12.31 | 96.07 |
| 16 | 12.00 | 93.63 |
| 24 | 12.07 | 94.15 |
| 32 | 20.34 | 158.76 |
| 48 | 15.86 | 123.73 |
| 64 | 10.98 | 85.69 |
| 96 | 11.24 | 87.66 |
| 128 | 7.53 | 58.77 |
```
**This branch (tune)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 12.48 | 97.37 |
| 16 | 12.26 | 95.70 |
| 24 | 12.25 | 95.53 |
| 32 | 11.98 | 93.58 |
| 48 | 12.57 | 98.12 |
| 64 | 11.28 | 88.05 |
| 96 | 9.55 | 74.53 |
| 128 | 7.51 | 58.61 |
```
# Bench Data Format
**Example**
```
5 3B 2 6 1
3200 3200 2 0 3 10
16 0 0 0 16 1 0 1 0 0 0 0
16 1 0 2 17 0 1 0 0 0 0 0
0 0 0 0 34 0 1 0 0 0 0 0
1 1 793 0 9103 2102 0 0 6014 0
2 2 1591 0 8034 2305 0 0 30982 0
4 4 2236 0 6476 2484 0 0 31388 0
8 7 4161 0 6623 2389 0 0 29204 0
16 15 8339 0 6434 2752 0 0 34303 0
32 32 16919 0 6915 3651 0 0 42511 0
64 200 34270 0 6574 4528 0 0 68212 0
128 188 69400 0 6325 6839 0 0 74437 0
256 303 134597 0 6168 11544 0 0 110180 0
512 687 279685 0 6337 29712 0 0 159728 0
3200 8640 2 0 2 10
...
```
**Informal Explanation**
```
head
groups+
head := version model ggml_ftype n_shapes n_threads
shape+
# head
version: 1
model: "3B" | "7B" | "13B" | "30B" | "65B"
ggml_ftype: 0 - 4, 7 - 14
n_shapes: number of shapes
n_threads: number of threads
shape := N K m_num n_profiles
task_conf_profile+
bench_item+
task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize)
stage_conf: backend parallel wait
backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL)
parallel: 0 (false) | 1 (true)
wait: 0 (false) | 1 (true)
bench_item: M profile_time+
profile_time := stage_time[3]
stage_time[3]: init_time, compute_time, finalize_time
```
A task stage is invalid if its backend equals `GGML_TASK_BACKEND_NONE`.
The time unit is `us`. A column is all zeros when that stage does not exist.
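For example, in the sample above the row `16 15 8339 0 6434 2752 0 0 34303 0` is
M = 16 followed by init/compute/finalize times for each of the shape's 3 profiles.
Below is a minimal C sketch of a reader for one such row (a hypothetical helper, not
part of this commit; it assumes `n_profiles` has already been read from the shape line):
```
#include <stdio.h>

// Read one bench_item row: M, then 3 stage times (init, compute, finalize)
// in microseconds for each of the shape's n_profiles profiles.
// A stage time of 0 means that stage does not exist for the profile.
// Returns 0 on success, -1 on a malformed row.
static int read_bench_item(FILE *fp, int n_profiles, int *M, int stages_time[][3]) {
    if (fscanf(fp, "%d", M) != 1) {
        return -1;
    }
    for (int ip = 0; ip < n_profiles; ip++) {
        if (fscanf(fp, "%d %d %d", &stages_time[ip][0], &stages_time[ip][1],
                   &stages_time[ip][2]) != 3) {
            return -1;
        }
    }
    return 0;
}
```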
# NOTE
1. "3B" is [open-llama 3B](https://github.com/ggerganov/llama.cpp/pull/1588).
2. Model names are subject to change: we may support something like X-3B, Y-4B, ...
3. As of Jun 1, this tool is still at an early stage and will change frequently
over the coming days (or weeks).

examples/mulmat-tune/mulmat-tune.cpp

@@ -0,0 +1,277 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "build-info.h"
#include "ggml-tune.h"
#include "ggml.h"
#include "llama.h"
#define UNUSED(x) (void)(x)
static void print_build_tips(void) {
const char *a = "LLAMA_NO_ACCELERATE";
fprintf(stderr, "Tips on how to build with various backend vendors:\n\n");
fprintf(stderr, "CUDA: make clean; LLAMA_CUBLAS=1 make\n");
fprintf(stderr, "CL: make clean; LLAMA_CLBLAST=1 make\n");
fprintf(stderr, "Accelerate: make clean; %s= make\n", a);
fprintf(stderr, "OpenBLAS: make clean; %s=1 LLAMA_OPENBLAS=1 make\n", a);
fprintf(stderr, "BLIS: make clean; %s=1 LLAMA_BLIS=1 make\n", a);
fprintf(stderr, "\n");
fprintf(stderr, "NOTE: for CUDA/CL, use %s=1 to disable ACCELERATE\n", a);
}
static bool prompt_yes_no(const char *prompt) {
char buf[2];
while (true) {
fprintf(stderr, "%s (Y|n)\n", prompt);
buf[0] = 0;
buf[1] = 0;
int i = 0;
int c = 0;
while (c != '\n') {
c = fgetc(stdin);
buf[i % 2] = c;
i++;
}
if (i == 1) {
if (buf[0] == '\n') {
return true;
}
} else if (i == 2) {
if (buf[0] == 'Y' || buf[0] == 'y') {
return true;
}
if (buf[0] == 'N' || buf[0] == 'n') {
return false;
}
}
}
}
static void cmd_analyze(struct ggml_mulmat_tune *tune);
static void usage(char *prog) {
const char *usage_lines[] = {
"usage: %s args",
"",
"bench [-m MODEL] [-t TYPE] [-f FILE] [-y]",
"--model MODEL 3B | 7B | 13B | 30B | 65B",
" default 7B",
"--ftype FTYPE ggml ftype:",
" 0: all F32",
" 1: mostly F16",
" 2: mostly Q4_0",
" 3: mostly Q4_1",
" 4: mostly Q4_1, some F16",
" 7: mostly Q8_0",
" 8: mostly Q5_0",
" 9: mostly Q5_1",
" 10: mostly Q2_K",
" 11: mostly Q3_K",
" 12: mostly Q4_K",
" 13: mostly Q5_K",
" 14: mostly Q6_K",
" default 2 (mostly Q4_0)",
"--m_num M_NUM number of M, the max M = 2^(M_NUM-1)",
" requires between [6, 12]",
" default 10",
"--n_pass PASS number of passes to run",
" default 1",
" requires: between [1, 3]",
"--n_threads NTH bench with this number of threads",
" requires: between [1, 16]",
" default 1",
"--file FILE data file to write",
" default stdout",
"-y always answer \"yes\" to all prompts",
};
int len = (int)(sizeof(usage_lines) / sizeof(char *));
for (int i = 0; i < len; i++) {
const char *line = usage_lines[i];
if (i == 0) {
fprintf(stderr, line, prog);
} else {
fprintf(stderr, "%s\n", line);
}
}
printf("\n");
print_build_tips();
printf("\n");
}
int main(int argc, char **argv) {
if (argc == 2) {
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
usage(argv[0]);
return 0;
}
}
int arg_start = 1;
const char *arg_model = NULL;
const char *arg_ftype = NULL;
const char *arg_m_num = NULL;
const char *arg_n_threads = NULL;
const char *arg_n_pass = NULL;
const char *arg_file = NULL;
bool always_yes = false;
for (int i = arg_start; i < argc; i++) {
if (strcmp(argv[i], "--model") == 0) {
if (i + 1 < argc) {
arg_model = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--ftype") == 0) {
if (i + 1 < argc) {
arg_ftype = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--m_num") == 0) {
if (i + 1 < argc) {
arg_m_num = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--n_pass") == 0) {
if (i + 1 < argc) {
arg_n_pass = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--n_threads") == 0) {
if (i + 1 < argc) {
arg_n_threads = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--file") == 0) {
if (i + 1 < argc) {
arg_file = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "-y") == 0) {
always_yes = true;
} else {
fprintf(stderr, "invalid arg: %s\n", argv[i]);
usage(argv[0]);
return 1;
}
}
enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0;
{
if (arg_ftype != NULL) {
int v = atoi(arg_ftype);
ftype = (enum ggml_ftype)v;
}
if (ftype > GGML_FTYPE_MOSTLY_Q5_1) {
fprintf(stderr, "k_quants type %d is not implemented\n", ftype);
return 1;
}
}
if (arg_file != NULL && !always_yes) {
struct stat st;
int rc = stat(arg_file, &st);
UNUSED(st);
if (rc == 0) { // prompt
size_t len = strlen(arg_file) + 50;
char *prompt = (char *)malloc(len);
GGML_ASSERT(prompt);
snprintf(prompt, len, "data file '%s' exists, override?", arg_file);
if (!prompt_yes_no(prompt)) {
printf("Aborted.\n");
return 1;
}
free(prompt);
}
}
int m_num = 10;
{
if (arg_m_num != NULL) {
int v = atoi(arg_m_num);
m_num = v;
}
if (m_num < 6 || m_num > 12) {
fprintf(stderr, "invalid m_num: %d, expect between [6, 12]\n",
m_num);
usage(argv[0]);
return 1;
}
}
int n_pass = 1;
{
if (arg_n_pass != NULL) {
int v = atoi(arg_n_pass);
n_pass = v;
}
if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) {
fprintf(stderr, "invalid n_pass: %d, expect between [1, %d]\n",
n_pass, GGML_MULMAT_MAX_PASS);
usage(argv[0]);
return 1;
}
}
int n_threads = 1;
{
if (arg_n_threads != NULL) {
int v = atoi(arg_n_threads);
n_threads = v;
if (n_threads < 1 || n_threads > 16) {
fprintf(stderr,
"invalid n_threads: %d, expect between [1, 16]\n",
n_threads);
usage(argv[0]);
return 1;
}
}
}
const char *model_name = "7B";
{
if (arg_model != NULL) {
model_name = arg_model;
}
}
// Let init message print earlier.
{
struct ggml_init_params init_params = {
/*.mem_size =*/1,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/0,
};
struct ggml_context *ctx = ggml_init(init_params);
GGML_ASSERT(ctx);
ggml_free(ctx);
}
struct ggml_mulmat_tune tune;
struct ggml_mulmat_tune_params params;
memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
ggml_mulmat_init_task_profiles();
ggml_mulmat_tune_model_init(&params.model, model_name, ftype);
params.m_num = m_num;
params.n_pass = n_pass;
params.n_threads = n_threads;
params.progress = true;
params.output_console = true;
params.fname = arg_file;
bool ok = ggml_mulmat_tune_bench(&tune, &params);
return ok ? 0 : 1;
}

examples/perplexity/perplexity.cpp

@@ -158,6 +158,16 @@ int main(int argc, char ** argv) {
return 1;
}
#ifdef GGML_USE_MULMAT_TUNE
if (params.tune || !params.tune_file.empty()){
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) {
llama_free(ctx);
return ok? 0: 1;
}
}
#endif
// print system information
{
fprintf(stderr, "\n");

ggml-cuda.cu

@@ -2571,7 +2571,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
func = ggml_cuda_rms_norm;
break;
case GGML_OP_MUL_MAT:
if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) {
return false;
}
func = ggml_cuda_mul_mat;

ggml-opencl.cpp

@@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_
}
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
// GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
if (src0->type == GGML_TYPE_F32) {
ggml_cl_mul_mat_f32(src0, src1, dst);

ggml-threading.c Normal file

@@ -0,0 +1,620 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ggml-threading.h"
#include "ggml.h"
#define UNUSED(x) (void)(x)
// see https://github.com/ggerganov/llama.cpp/pull/1314
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#include <emmintrin.h>
static inline void ggml_spin_pause(void) { _mm_pause(); }
#else
static inline void ggml_spin_pause(void) {}
#endif
#if defined(_WIN32)
#include <windows.h>
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
typedef LONG atomic_flag;
typedef CRITICAL_SECTION pthread_mutex_t;
typedef CONDITION_VARIABLE pthread_cond_t;
typedef void pthread_mutexattr_t;
typedef void pthread_condattr_t;
typedef HANDLE pthread_t;
static void atomic_store(atomic_int *ptr, LONG val) {
InterlockedExchange(ptr, val);
}
static LONG atomic_load(atomic_int *ptr) {
return InterlockedCompareExchange(ptr, 0, 0);
}
static LONG atomic_fetch_add(atomic_int *ptr, LONG inc) {
return InterlockedExchangeAdd(ptr, inc);
}
static LONG atomic_fetch_sub(atomic_int *ptr, LONG dec) {
return atomic_fetch_add(ptr, -(dec));
}
static inline LONG atomic_flag_test_and_set(volatile atomic_flag *ptr) {
return InterlockedCompareExchange(ptr, 1, 0);
}
static inline LONG atomic_flag_clear(volatile atomic_flag *ptr) {
return InterlockedExchange(ptr, 0);
}
static int pthread_create(pthread_t *out, void *unused,
ggml_thread_ret_t (*func)(void *), void *arg) {
(void)unused;
HANDLE handle =
CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
if (handle == NULL) {
return EAGAIN;
}
*out = handle;
return 0;
}
static int pthread_join(pthread_t thread, void *unused) {
(void)unused;
return (int)WaitForSingleObject(thread, INFINITE);
}
static int pthread_mutex_init(pthread_mutex_t *mutex,
pthread_mutexattr_t *attr) {
(void)attr;
InitializeCriticalSection(mutex);
return 0;
}
static int pthread_mutex_destroy(pthread_mutex_t *mutex) {
DeleteCriticalSection(mutex);
return 0;
}
static int pthread_mutex_lock(pthread_mutex_t *mutex) {
EnterCriticalSection(mutex);
return 0;
}
static int pthread_mutex_unlock(pthread_mutex_t *mutex) {
LeaveCriticalSection(mutex);
return 0;
}
static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr) {
(void)attr;
InitializeConditionVariable(cond);
return 0;
}
static int pthread_cond_destroy(pthread_cond_t *cond) {
(void)cond;
return 0;
}
static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) {
SleepConditionVariableCS(cond, mutex, INFINITE);
return 0;
}
static int pthread_cond_signal(pthread_cond_t *cond) {
WakeConditionVariable(cond);
return 0;
}
static int pthread_cond_broadcast(pthread_cond_t *cond) {
WakeAllConditionVariable(cond);
return 0;
}
static int sched_yield(void) {
// https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-yieldprocessor
YieldProcessor();
return 0;
}
#else // ! _WIN32
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#endif
// #define GGML_THREADING_DEBUG 1
#ifdef GGML_THREADING_DEBUG
#define PRINT_DEBUG(...) fprintf(stdout, __VA_ARGS__)
#else
#define PRINT_DEBUG(...)
#endif
struct ggml_perf_stats {
int runs;
// total cycles
atomic_int cycles;
// total time in us.
atomic_int time_us;
};
struct ggml_compute_state_shared {
atomic_flag spin;
pthread_mutex_t mutex;
pthread_cond_t cond;
// number of threads that have entered the thread runner.
atomic_int n_ready;
// number of assigned but unfinished tasks, workers decrease it.
atomic_int n_tasks;
// number of waiting workers, workers increase it.
atomic_int n_waiting;
// commands.
atomic_bool wait_now;
atomic_bool wait_on_done;
atomic_bool stop;
ggml_threading_task_runner *task_runner;
struct ggml_threading_context *ctx;
};
struct ggml_compute_state {
pthread_t thrd;
atomic_bool has_work;
struct ggml_compute_params params;
struct ggml_tensor *node;
struct ggml_compute_state_shared *shared;
};
struct ggml_threading_context {
int n_threads;
struct ggml_compute_state_shared shared;
struct ggml_compute_state *workers;
enum ggml_threading_features features;
struct ggml_perf_stats wait_perf;
struct ggml_perf_stats wakeup_perf;
int64_t *stages_time;
};
// NOTE: ggml_spin_lock and ggml_spin_unlock could be no-ops when the
// wait_on_done feature is off.
static inline void ggml_spin_lock(volatile atomic_flag *obj) {
while (atomic_flag_test_and_set(obj)) {
ggml_spin_pause();
}
}
static inline void ggml_spin_unlock(volatile atomic_flag *obj) {
atomic_flag_clear(obj);
}
static inline void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0,
int64_t t0) {
st->runs++;
st->cycles += (ggml_cycles() - c0);
st->time_us += (ggml_time_us() - t0);
}
// A worker thread goes cond waiting.
// NOTE: must be protected by shared->spin
static void ggml_threading_cond_wait(struct ggml_compute_state *state) {
struct ggml_compute_state_shared *shared = state->shared;
int64_t perf_cycles_0 = 0;
int64_t perf_time_0 = 0;
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
perf_cycles_0 = ggml_cycles();
perf_time_0 = ggml_time_us();
}
GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
if (!shared->wait_now) {
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
ggml_spin_unlock(&shared->spin);
return;
}
shared->n_waiting++;
ggml_spin_unlock(&shared->spin);
GGML_ASSERT(pthread_cond_wait(&shared->cond, &shared->mutex) == 0);
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
ggml_spin_lock(&shared->spin);
shared->n_waiting--;
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
ggml_perf_collect(&shared->ctx->wait_perf, perf_cycles_0, perf_time_0);
}
}
// Wakeup all workers.
//
// Workers take some time to wake up, and have to lock the spin lock after waking.
// Yield is used to avoid signaling too frequently. The current implementation is highly
// experimental. See tests/test-ggml-threading.c for details.
//
// NOTE: must be protected by shared->spin
static void
ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) {
int64_t perf_cycles_0 = 0;
int64_t perf_time_0 = 0;
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
perf_cycles_0 = ggml_cycles();
perf_time_0 = ggml_time_us();
}
shared->wait_now = false;
int loop_counter = 0;
int notify_counter = 0;
int64_t last_signal_time = 0;
while (shared->n_waiting != 0) {
ggml_spin_unlock(&shared->spin);
if (loop_counter > 0) {
ggml_spin_pause();
if (loop_counter > 3) {
sched_yield();
}
}
++loop_counter;
// TODO: should bench actual average wait/wakeup time.
if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) {
continue;
}
GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0);
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
++notify_counter;
last_signal_time = ggml_time_us();
ggml_spin_lock(&shared->spin);
}
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0,
perf_time_0);
}
// if (notify_counter > 1) {
// printf("%s: loop counter: %d, notify counter: %d\n", __func__,
// loop_counter, notify_counter);
// }
UNUSED(notify_counter);
}
// Setup workers for a task stage.
// NOTE: must be protected by shared->spin
static void ggml_threading_setup_workers(struct ggml_threading_context *ctx,
struct ggml_task_profile *profile,
enum ggml_task_type type) {
PRINT_DEBUG("[main] setup workers for task ...\n");
#ifdef GGML_THREADING_DEBUG
int64_t t0 = ggml_time_us();
#endif
const int n_worker_threads = ctx->n_threads - 1;
struct ggml_task_stage *current = &profile->stages[type];
struct ggml_compute_state_shared *shared = &ctx->shared;
if (current->parallel) {
if (shared->n_waiting > 0) {
ggml_threading_wakeup_workers(shared);
}
if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) {
// Optimize energy: wait_on_done. We MAY also check following nodes,
// but that's a bit complicated.
shared->wait_on_done = false;
for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) {
struct ggml_task_stage *next = &profile->stages[i];
if (next->parallel) {
break;
}
if (next->wait) {
shared->wait_on_done = true;
PRINT_DEBUG("[main] wait_on_done is enabled for "
"current task stage\n");
break;
}
}
}
} else if (current->wait) {
if (shared->n_waiting < n_worker_threads) {
shared->wait_now = true;
PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n",
n_worker_threads);
ggml_spin_unlock(&shared->spin);
while (shared->n_waiting != n_worker_threads) {
ggml_spin_pause();
}
ggml_spin_lock(&shared->spin);
PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads);
}
}
PRINT_DEBUG("[main] setup workers for task took %d us\n",
(int)(ggml_time_us() - t0));
}
ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
GGML_ASSERT(data);
struct ggml_compute_state *state = (struct ggml_compute_state *)data;
GGML_ASSERT(state);
struct ggml_compute_state_shared *shared = state->shared;
GGML_ASSERT(shared);
GGML_ASSERT(shared->task_runner);
shared->n_ready++;
PRINT_DEBUG("[%d-th] running\n", state->params.ith);
while (!shared->stop) {
if (shared->wait_now) {
ggml_spin_lock(&shared->spin);
if (!state->has_work) {
ggml_threading_cond_wait(state);
}
ggml_spin_unlock(&shared->spin);
}
if (shared->n_tasks > 0 && state->has_work) {
enum ggml_compute_error err =
shared->task_runner(&state->params, state->node);
GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
ggml_spin_lock(&shared->spin);
state->has_work = false;
shared->n_tasks--;
bool wait = shared->wait_on_done && !state->has_work;
if (wait) {
ggml_threading_cond_wait(state);
}
ggml_spin_unlock(&shared->spin);
// no need to pause.
if (wait) {
continue;
}
}
ggml_spin_pause();
}
PRINT_DEBUG("[%d-th] exited\n", state->params.ith);
return 0;
}
enum ggml_compute_error
ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
struct ggml_tensor *node, void *wdata,
size_t wsize) {
GGML_ASSERT(ctx);
GGML_ASSERT(node);
GGML_ASSERT(ctx->shared.task_runner);
struct ggml_compute_state_shared *state_shared = &ctx->shared;
// This is the params for main thread.
struct ggml_compute_params params;
enum ggml_compute_error err;
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
continue;
}
PRINT_DEBUG("[main] stage: %d\n", type);
int64_t t_stage = 0;
if (ctx->stages_time) {
t_stage = ggml_time_us();
}
// n_tasks is the total number of parallel computing tasks
// (including main thread).
int n_tasks =
node->task_profile.stages[type].parallel ? ctx->n_threads : 1;
ggml_spin_lock(&state_shared->spin);
if (ctx->n_threads > 1) {
ggml_threading_setup_workers(ctx, &node->task_profile, type);
}
if (n_tasks > 1) {
// setup compute task parameters.
for (int j = 0; j < n_tasks - 1; j++) {
ctx->workers[j].params = (struct ggml_compute_params){
.type = type,
.ith = j + 1,
.nth = n_tasks,
.wsize = wsize,
.wdata = wdata,
};
ctx->workers[j].node = node;
ctx->workers[j].has_work = true;
}
state_shared->n_tasks = n_tasks - 1;
PRINT_DEBUG("[main] assigned %d tasks\n", state_shared->n_tasks);
}
ggml_spin_unlock(&state_shared->spin);
// main thread always run the 0-th task.
// TODO: assert(params->nth == 1) instead of
// assert(params->ith == 0)
{
params.type = type;
params.ith = 0;
params.nth = n_tasks;
params.wsize = wsize;
params.wdata = wdata;
err = state_shared->task_runner(&params, node);
}
// wait for tasks done.
if (n_tasks > 1) {
while (state_shared->n_tasks != 0) {
ggml_spin_pause();
}
}
PRINT_DEBUG("[main] all tasks finished\n\n");
if (ctx->stages_time) {
ctx->stages_time[type] = ggml_time_us() - t_stage;
}
if (err != GGML_COMPUTE_OK) {
return err;
}
}
return GGML_COMPUTE_OK;
}
struct ggml_threading_context *
ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
ggml_threading_task_runner *task_stage_runner,
enum ggml_threading_features features,
int64_t stages_time[3]) {
GGML_ASSERT(n_threads > 0);
GGML_ASSERT(thread_runner);
GGML_ASSERT(task_stage_runner);
size_t ctx_sz = sizeof(struct ggml_threading_context);
struct ggml_threading_context *ctx = malloc(ctx_sz);
GGML_ASSERT(ctx);
memset(ctx, 0, ctx_sz);
ctx->shared = (struct ggml_compute_state_shared){
.spin = {0},
.n_ready = 0,
.n_tasks = 0,
.n_waiting = 0,
.wait_now = false,
.wait_on_done = false,
.stop = false,
.task_runner = task_stage_runner,
.ctx = ctx,
};
PRINT_DEBUG("[main] thread start, features: %d\n", features);
ctx->n_threads = n_threads;
ctx->features = features;
ctx->stages_time = stages_time;
int n_workers = n_threads - 1;
if (n_workers > 0) {
GGML_ASSERT(pthread_mutex_init(&ctx->shared.mutex, NULL) == 0);
GGML_ASSERT(pthread_cond_init(&ctx->shared.cond, NULL) == 0);
size_t workers_sz = sizeof(struct ggml_compute_state) * n_workers;
struct ggml_compute_state *workers = malloc(workers_sz);
GGML_ASSERT(workers);
memset(workers, 0, workers_sz);
for (int j = 0; j < n_workers; j++) {
workers[j].shared = &ctx->shared;
GGML_ASSERT(pthread_create(&workers[j].thrd, NULL, thread_runner,
&workers[j]) == 0);
}
ctx->workers = workers;
while (ctx->shared.n_ready != n_workers) {
ggml_spin_pause();
}
}
return ctx;
}
static void
ggml_threading_print_perf_stats(struct ggml_threading_context *ctx) {
bool print_stats = ctx->features & GGML_THREADING_FEATURE_PERF;
#ifdef GGML_THREADING_DEBUG
print_stats = true;
#endif
if (!print_stats) {
return;
}
const char *prefix_arr[2] = {"[threading wait ]", "[threading wakeup]"};
struct ggml_perf_stats *st_arr[2] = {&ctx->wait_perf, &ctx->wakeup_perf};
for (int i = 0; i < 2; i++) {
struct ggml_perf_stats *st = st_arr[i];
if (st->runs == 0) {
continue;
}
fprintf(stdout,
"%s runs: %4d, avg cycles: %8.3f ms, avg time: "
"%8.3f ms\n",
prefix_arr[i], st->runs,
1.0 * st->cycles / (st->runs * ggml_cycles_per_ms()),
1.0 * st->time_us / (st->runs * 1000));
}
}
void ggml_threading_stop(struct ggml_threading_context *ctx) {
GGML_ASSERT(ctx);
if (ctx->workers) {
PRINT_DEBUG("[main] stopping thread pool ...\n");
ctx->shared.stop = true;
ggml_spin_lock(&ctx->shared.spin);
ggml_threading_wakeup_workers(&ctx->shared);
ggml_spin_unlock(&ctx->shared.spin);
for (int j = 0; j < ctx->n_threads - 1; j++) {
GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0);
}
free(ctx->workers);
PRINT_DEBUG("[main] thread pool stopped\n");
}
ggml_threading_print_perf_stats(ctx);
free(ctx);
}

ggml-threading.h Normal file

@@ -0,0 +1,68 @@
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
#if defined(_WIN32)
typedef int ggml_thread_ret_t;
#else
typedef void *ggml_thread_ret_t;
#endif
struct ggml_threading_context;
// Optional (experimental) features.
enum ggml_threading_features {
GGML_THREADING_FEATURE_NONE = 0,
GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0,
GGML_THREADING_FEATURE_PERF = 1 << 1,
};
// Compute errors.
enum ggml_compute_error {
GGML_COMPUTE_OK = 0,
GGML_COMPUTE_FALLBACK = 1,
};
// The task runner to be called by main thread and workers.
typedef enum ggml_compute_error(ggml_threading_task_runner)(
struct ggml_compute_params *params, struct ggml_tensor *node);
// The thread runner to feed into OS threads.
typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
// Init and start underlying workers if n_threads > 1.
//
// features: optional, configures additional threading features.
// see `ggml_threading_features`, default 0.
// stages_time: optional for collecting per-stage wall clock time.
struct ggml_threading_context *
ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread,
ggml_threading_task_runner *task_stage_runner,
enum ggml_threading_features features,
int64_t stages_time[3]);
// Stop workers (if exist), free memories (including the ctx).
void ggml_threading_stop(struct ggml_threading_context *ctx);
// The default implementation of `ggml_threading_thread_runner`
ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data);
// Compute a tensor. It computes the enabled task stages one by one.
// Caller should take care of the return error: retry for fallback error.
enum ggml_compute_error
ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
struct ggml_tensor *node, void *wdata,
size_t wsize);
// This is an experimental functionality for mulmat tune, as a thin wrapper.
enum ggml_compute_error
ggml_compute_forward_wrapper(struct ggml_compute_params *params,
struct ggml_tensor *tensor);
#ifdef __cplusplus
}
#endif
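To illustrate how the pieces declared above fit together, here is a minimal usage
sketch (`my_task_runner`, `node`, `wdata` and `wsize` are placeholders, not code
from this commit): start a pool, compute one tensor, retry once on fallback, then
stop the pool.
```
#include "ggml.h"
#include "ggml-threading.h"

// Placeholder task runner: a real one dispatches on node->op and on the
// stage given by params->type.
static enum ggml_compute_error
my_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) {
    (void)params;
    (void)node;
    return GGML_COMPUTE_OK;
}

// Compute a single tensor with a 4-thread pool, then tear the pool down.
static void compute_one(struct ggml_tensor *node, void *wdata, size_t wsize) {
    int64_t stages_time[3] = {0};
    struct ggml_threading_context *ctx = ggml_threading_start(
        4, ggml_threading_graph_compute_thread, my_task_runner,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);

    enum ggml_compute_error err =
        ggml_threading_compute_tensor(ctx, node, wdata, wsize);
    if (err == GGML_COMPUTE_FALLBACK) {
        // as the header suggests: adjust node->task_profile, then retry
        err = ggml_threading_compute_tensor(ctx, node, wdata, wsize);
    }
    (void)err; // a real caller would check the final status

    ggml_threading_stop(ctx);
}
```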

ggml-tune.c Normal file

@@ -0,0 +1,897 @@
#include <string.h>
#include "ggml-threading.h"
#include "ggml-tune.h"
#include "ggml.h"
// MUL_MAT fine tuning for non-GPU-offloading cases.
#define GGML_MULMAT_CACHE_LEN 16
static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0};
#define FNV_OFFSET 14695981039346656037UL
#define FNV_PRIME 1099511628211UL
static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) {
char buf[30];
snprintf(buf, 30, "%d%d%d", M, N, K);
uint64_t hash = FNV_OFFSET;
for (const char *p = buf; *p; p++) {
hash ^= (uint64_t)(unsigned char)(*p);
hash *= FNV_PRIME;
}
return hash;
}
static const char *
ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
switch (backend) {
case GGML_TASK_BACKEND_NONE:
return "";
case GGML_TASK_BACKEND_CPU:
return "CPU";
case GGML_TASK_BACKEND_CPU_BLAS:
return "BLAS";
case GGML_TASK_BACKEND_GPU:
return "GPU";
case GGML_TASK_BACKEND_GPU_CUDA:
return "CUDA";
case GGML_TASK_BACKEND_GPU_CL:
return "CL";
default:
GGML_ASSERT(false);
}
}
const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t,
enum ggml_type src1_t, int stages_time[3]) {
GGML_ASSERT(tune);
// TODO: default_mm_cache is thread-unsafe.
struct mm_cache_element *mm_cache = default_mm_cache;
int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN;
struct mm_cache_element *e = &mm_cache[slot];
struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
struct ggml_task_profile *prof = NULL;
if (e->M == M && e->N == N && e->K == K) {
prof = e->profile;
if (stages_time) {
for (int i = 0; i < 3; i++) {
stages_time[i] = e->stages_time[i];
}
}
} else {
const struct ggml_mulmat_tune_shape *shape = NULL;
shape = ggml_mulmat_tune_get_shape(tune, N, K, src0_t, src1_t);
if (shape) {
ggml_mulmat_tune_estimate_time(shape, M, profiles_time);
int min = INT32_MAX;
int index = -1;
for (int i = 0; i < shape->n_profiles; i++) {
int total = profiles_time[i].total_time;
if (total < min) {
min = total;
index = i;
}
}
if (index >= 0) {
prof = profiles_time[index].profile;
for (int i = 0; i < 3; i++) {
int t = profiles_time[index].stage_time[i];
if (stages_time) {
stages_time[i] = t;
}
e->stages_time[i] = t;
}
GGML_ASSERT(prof);
e->profile = prof;
e->M = M;
e->N = N;
e->K = K;
// to disable this, build with
// `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1
// make`
#if !defined(GGML_MULMAT_TUNE_NDEBUG)
const char *names[3];
for (int i = 0; i < 3; i++) {
names[i] = ggml_mulmat_tune_task_backend_name(
prof->stages[i].backend);
}
printf(
"\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the "
"fastest profile: %s %s %s\n",
M, N, K, names[0], names[1], names[2]);
#endif
}
}
}
return prof;
}
void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
const char *name, enum ggml_ftype ftype) {
const int n_vocab = 32000;
int n_embd;
// n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult
int n_ff;
// n_rot = n_embd/n_head;
int n_rot;
if (strcmp(name, "3B") == 0) {
// n_head=32, n_mult=216, n_layer=26
// https://github.com/ggerganov/llama.cpp/pull/1588
n_embd = 3200;
n_ff = 8640;
n_rot = 100;
} else if (strcmp(name, "7B") == 0) {
n_embd = 4096;
n_ff = 11008;
n_rot = 128;
} else if (strcmp(name, "13B") == 0) {
n_embd = 5120;
n_ff = 13824;
n_rot = 128;
} else if (strcmp(name, "30B") == 0) {
n_embd = 6656;
n_ff = 17920;
n_rot = 128;
} else if (strcmp(name, "65B") == 0) {
n_embd = 8192;
n_ff = 22016;
n_rot = 128;
} else {
GGML_ASSERT(false);
}
model->name = name;
model->ftype = ftype;
model->n_vocab = n_vocab;
model->n_embd = n_embd;
model->n_ff = n_ff;
model->n_rot = n_rot;
}
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params,
struct ggml_task_profile_factory *pf) {
struct ggml_mulmat_tune_model *model = &params->model;
memset(tune, 0, sizeof(struct ggml_mulmat_tune));
tune->version = GGML_MULMAT_TUNE_VERSION;
tune->n_threads = params->n_threads;
tune->ftype = model->ftype;
size_t name_len = strlen(model->name);
GGML_ASSERT(name_len > 0);
strncpy(tune->model, model->name, sizeof(tune->model) - 1);
const enum ggml_type rot_src0_type = GGML_TYPE_F16;
const enum ggml_type src1_type = GGML_TYPE_F32;
int n_vocab = model->n_vocab;
int n_embd = model->n_embd;
int n_ff = model->n_ff;
int n_rot = model->n_rot;
enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype);
GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6);
tune->n_shapes = GGML_MULMAT_N_SHAPES;
// Attention layers
tune->shapes[0] = (struct ggml_mulmat_tune_shape){
.N = n_embd, .K = n_embd, .src0_type = type, .src1_type = src1_type};
// Feed forward layers
tune->shapes[1] = (struct ggml_mulmat_tune_shape){
.N = n_embd, .K = n_ff, .src0_type = type, .src1_type = src1_type};
tune->shapes[2] = (struct ggml_mulmat_tune_shape){
.N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type};
tune->shapes[3] = (struct ggml_mulmat_tune_shape){
.N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type};
// RoPE
tune->shapes[4] = (struct ggml_mulmat_tune_shape){
.N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type};
tune->shapes[5] = (struct ggml_mulmat_tune_shape){
.N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type};
for (int i = 0; i < tune->n_shapes; i++) {
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
shape->n_profiles = ggml_mulmat_get_task_profiles(
pf, shape->src0_type, shape->src1_type, &shape->profiles);
if (shape->n_profiles == 0) {
// allowed for testing.
continue;
}
shape->m_num = params->m_num;
shape->arr_m = malloc(shape->m_num * sizeof(int));
for (int j = 0; j < shape->m_num; j++) {
shape->arr_m[j] = 1 << j;
}
size_t sz = sizeof(struct ggml_mulmat_tune_m) *
(shape->n_profiles * shape->m_num);
shape->items = malloc(sz);
GGML_ASSERT(shape->items);
memset(shape->items, 0, sz);
}
return true;
}
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
for (int i = 0; i < tune->n_shapes; i++) {
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
GGML_ASSERT(shape);
// arr_m and items can be NULL only when testing.
if (shape->arr_m) {
free(shape->arr_m);
}
if (shape->items) {
free(shape->items);
}
}
}
static bool ggml_mulmat_tune_write_profiles(
FILE *fp, const struct ggml_task_profile *profiles, int n_profiles) {
int rc;
for (int i = 0; i < n_profiles; i++) {
const struct ggml_task_profile *profile = &profiles[i];
for (int j = 0; j < 3; j++) {
const struct ggml_task_stage *ts = &profile->stages[j];
rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
ts->wait ? 1 : 0);
if (rc <= 0) {
return false;
}
if (j < 2) {
rc = fprintf(fp, " ");
if (rc <= 0) {
return false;
}
}
}
rc = fprintf(fp, "\n");
if (rc <= 0) {
return false;
}
}
return true;
}
static bool
ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
const char *model, int ftype, int n_threads,
char *errbuf, int errbuf_len) {
if (tune->version != GGML_MULMAT_TUNE_VERSION) {
snprintf(errbuf, errbuf_len - 1,
"version mismatch, built-in: %d, "
"yours: %d",
GGML_MULMAT_TUNE_VERSION, tune->version);
return false;
} else if (strcmp(model, tune->model) != 0) {
snprintf(errbuf, errbuf_len - 1,
"model mismatch. built-in: %s, yours: %s", model, tune->model);
return false;
} else if (ftype != tune->ftype) {
snprintf(errbuf, errbuf_len - 1,
"ftype mismatch. built-in: %d, yours: %d\n", ftype,
tune->ftype);
return false;
} else if (n_threads != tune->n_threads) {
snprintf(errbuf, errbuf_len - 1,
"n_threads mismatch. run-time: %d, yours: %d\n", n_threads,
tune->n_threads);
return false;
}
for (int i = 0; i < tune->n_shapes; i++) {
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
struct ggml_task_profile *builtin_profiles = NULL;
int n_profiles = ggml_mulmat_get_task_profiles(
NULL, shape->src0_type, shape->src1_type, &builtin_profiles);
if (n_profiles != shape->n_profiles) {
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
return false;
}
// TODO: profiles order is relevant, too strict.
size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
printf("=== built-in profiles:\n");
ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
n_profiles);
printf("=== incoming profiles:\n");
ggml_mulmat_tune_write_profiles(stderr, shape->profiles,
shape->n_profiles);
return false;
}
}
return true;
}
bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
const char *model, int ftype, int n_threads) {
char errbuf[128];
bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads,
errbuf, sizeof(errbuf));
if (!ok) {
fprintf(stderr, "[mulmat tune] error: %s. run bench again.\n", errbuf);
}
return ok;
}
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
int rc = fscanf(fp, "%d", &tune->version);
if (rc <= 0) {
return false;
}
if (tune->version != GGML_MULMAT_TUNE_VERSION) {
fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n");
return false;
}
rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype,
&tune->n_shapes, &tune->n_threads);
if (rc <= 0) {
return false;
}
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
rc = fscanf(fp, "%d %d %d %d %d %d", &shape->N, &shape->K,
(int *)&shape->src0_type, (int *)&shape->src1_type,
&shape->n_profiles, &shape->m_num);
if (rc <= 0) {
return false;
}
{
size_t item_size = sizeof(struct ggml_mulmat_tune_m) *
(shape->n_profiles * shape->m_num);
shape->items = malloc(item_size);
if (shape->items == NULL) {
fprintf(stderr, "[mulmat tune] failed to allocate memory\n");
return false;
}
memset(shape->items, 0, item_size);
}
{
size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles;
shape->profiles = malloc(sz);
GGML_ASSERT(shape->profiles);
memset(shape->profiles, 0, sz);
}
for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip];
for (int j = 0; j < 3; j++) {
struct ggml_task_stage *ts = &profile->stages[j];
int backend;
int parallel;
int wait;
rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait);
if (rc <= 0) {
return false;
}
ts->backend = (enum ggml_task_backend)backend;
ts->parallel = parallel ? true : false;
ts->wait = wait ? true : false;
}
}
for (int i_m = 0; i_m < shape->m_num; i_m++) {
int M;
for (int ip = 0; ip < shape->n_profiles; ip++) {
if (ip == 0) {
rc = fscanf(fp, "%d", &M);
if (rc <= 0) {
return false;
}
}
struct ggml_mulmat_tune_m *item =
&shape->items[ip * shape->m_num + i_m];
item->M = M;
rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
&item->stages_time[1], &item->stages_time[2]);
if (rc <= 0) {
return false;
}
}
}
}
return true;
}
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
FILE *fp) {
int rc;
rc = fprintf(fp, "%d %s %d %d %d\n\n", tune->version, tune->model,
tune->ftype, tune->n_shapes, tune->n_threads);
if (rc <= 0) {
return false;
}
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
if (i_shape > 0) {
fprintf(fp, "\n");
}
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
rc = fprintf(fp, "%d %d %d %d %d %d\n", shape->N, shape->K,
shape->src0_type, shape->src1_type, shape->n_profiles,
shape->m_num);
if (rc <= 0) {
return false;
}
rc = ggml_mulmat_tune_write_profiles(fp, shape->profiles,
shape->n_profiles);
if (rc <= 0) {
return false;
}
for (int i_m = 0; i_m < shape->m_num; i_m++) {
for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_mulmat_tune_m *item =
&shape->items[ip * shape->m_num + i_m];
if (ip == 0) {
rc = fprintf(fp, "%4d", item->M);
if (rc <= 0) {
return false;
}
}
struct ggml_task_profile *profile = &shape->profiles[ip];
for (int k = 0; k < 3; k++) {
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
rc = fprintf(fp, "%9d", item->stages_time[k]);
if (rc <= 0) {
return false;
}
} else {
rc = fprintf(fp, " 0");
if (rc <= 0) {
return false;
}
}
}
}
rc = fprintf(fp, "\n");
if (rc <= 0) {
return false;
}
}
}
return true;
}
const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
const int K, enum ggml_type src0_type,
enum ggml_type src1_type) {
GGML_ASSERT(N > 0 && K > 0);
for (int i = 0; i < tune->n_shapes; i++) {
const struct ggml_mulmat_tune_shape *s = &tune->shapes[i];
if (s->src0_type != src0_type || s->src1_type != src1_type) {
continue;
}
if (s->N > 0 && s->K > 0) {
if (s->N == N && s->K == K) {
return s;
}
} else if (s->N > 0 && s->K == 0) {
if (s->N == N) {
return s;
}
} else if (s->N == 0 && s->K > 0) {
if (s->K == K) {
return s;
}
}
}
return NULL;
}
// This is the experimental reference implementation.
// Requires that n_threads is the same at bench time and at runtime.
void ggml_mulmat_tune_estimate_time(
const struct ggml_mulmat_tune_shape *shape, int M,
struct ggml_mulmat_tune_time *profile_time) {
GGML_ASSERT(shape);
GGML_ASSERT(profile_time);
const int m_num = shape->m_num;
const int min_m = shape->items[0].M;
const int max_m = shape->items[m_num - 1].M;
for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip];
profile_time[ip].total_time = 0;
profile_time[ip].profile = profile;
const int items_offset = ip * m_num;
struct ggml_mulmat_tune_m *p0 = NULL;
struct ggml_mulmat_tune_m *p1 = NULL;
if (M < min_m) {
// first two.
p0 = &shape->items[items_offset];
p1 = &shape->items[items_offset + 1];
} else if (M > max_m) {
// last two
p0 = &shape->items[items_offset + m_num - 2];
p1 = &shape->items[items_offset + m_num - 1];
} else {
for (int i = 0; i < m_num; i++) {
p1 = &shape->items[items_offset + i];
if (p1->M == M) {
p0 = p1;
break;
}
if (i > 0) {
p0 = (struct ggml_mulmat_tune_m *)(p1 - 1);
if (M > p0->M && M < p1->M) {
break;
}
}
}
}
GGML_ASSERT(p0 && p1);
for (int i_stage = 0; i_stage < 3; i_stage++) {
struct ggml_task_stage *stage = &profile->stages[i_stage];
if (stage->backend == GGML_TASK_BACKEND_NONE) {
continue;
}
int p0_v = p0->stages_time[i_stage];
int p1_v = p1->stages_time[i_stage];
GGML_ASSERT(p0_v >= 0);
GGML_ASSERT(p1_v >= 0);
// t = aM + b
double a;
double b;
if (p0 == p1) {
a = 0.0;
b = p1_v;
} else {
a = 1.0 * (p1_v - p0_v) / (p1->M - p0->M);
b = p1_v - a * p1->M;
}
int t = (int)(a * M + b);
profile_time[ip].stage_time[i_stage] = t;
profile_time[ip].total_time += t;
}
}
}
// Experimental: create mul_mat tensor.
static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K,
enum ggml_type src0_type,
struct ggml_context **ctx) {
// At most 256, because in `ggml_quantize_qx_x`, the index type of hist is
// either int8_t or uint8_t.
// Use 1024 to avoid breaking unexpectedly.
int64_t hist[1024];
bool src0_is_quantized = ggml_is_quantized(src0_type);
size_t ctx_size = 0;
ctx_size += (size_t)(M * N * ggml_type_sizef(GGML_TYPE_F32)); // src1
ctx_size += (size_t)(N * K * ggml_type_sizef(src0_type)); // src0
ctx_size += (size_t)(1024 * 1024 * 64); // experimental
if (src0_is_quantized) {
// quantize F32 to Qx_x
ctx_size += (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32));
}
struct ggml_init_params init_params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = 0,
};
*ctx = ggml_init(init_params);
GGML_ASSERT(*ctx);
// src0: N x K
struct ggml_tensor *src0 =
ggml_new_tensor_2d(*ctx, src0_type, (int64_t)K, (int64_t)N);
// src1: M x K
struct ggml_tensor *src1 =
ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)M);
ggml_set_f32(src1, 0.5f);
if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) {
ggml_set_f32(src0, 0.1f);
} else if (src0_is_quantized) {
struct ggml_tensor *src0_f32 =
ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N);
ggml_set_f32(src0_f32, 0.1f);
switch (src0_type) {
case GGML_TYPE_Q4_0:
ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
case GGML_TYPE_Q4_1:
ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
case GGML_TYPE_Q5_0:
ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
case GGML_TYPE_Q5_1:
ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
case GGML_TYPE_Q8_0:
ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
default:
GGML_ASSERT(false);
}
} else {
GGML_ASSERT(false);
}
// node: M x N
// Will compute z = y * xT, z: node, y: src1, x: src0
return ggml_mul_mat(*ctx, src0, src1);
}
// Experimental: allocate memory for wdata with max possible size.
// This part of the code actually belongs to the ggml compute graph.
static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
// The size is actually determined by cgraph before computing.
// Apart from the src0_type, wsize is affected by backend, cache line size,
// n_threads etc.
const size_t extra = 1024 * 1024;
size_t sz = (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)) + extra;
void *buf = malloc(sz);
if (!buf) {
fprintf(stderr,
"[mulmat tune] error: failed to allocate %zu MiB memory\n",
sz / 1024 / 1024);
return 0;
}
memset(buf, 0, sz);
*wdata = buf;
return sz;
}
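// Illustrative sketch (not part of this commit): the intended caller pattern
// for the helper above -- allocate per (N, K), hand the buffer to the compute
// call, then free it. `compute_with_wdata` is a placeholder standing in for
// ggml_threading_compute_tensor() in the bench loop below.
static bool ggml_mulmat_tune_example_with_wdata(
    int N, int K, bool (*compute_with_wdata)(char *wdata, size_t wsize)) {
    char *wdata = NULL;
    size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata);
    if (wsize == 0) {
        return false; // allocation failed, error already printed
    }
    bool ok = compute_with_wdata(wdata, wsize);
    free(wdata);
    return ok;
}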
int ggml_mulmat_tune_get_builtin_task_backends(
enum ggml_task_backend *backends) {
int i = 0;
backends[i++] = GGML_TASK_BACKEND_CPU;
if (ggml_cpu_has_cpublas()) {
backends[i++] = GGML_TASK_BACKEND_CPU_BLAS;
}
if (ggml_cpu_has_cublas()) {
backends[i++] = GGML_TASK_BACKEND_GPU_CUDA;
} else if (ggml_cpu_has_clblast()) {
backends[i++] = GGML_TASK_BACKEND_GPU_CL;
}
return i;
}
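// Illustrative sketch (not part of this commit): list the task backends this
// binary was built with, using the same name helper as the bench below.
static void ggml_mulmat_tune_example_print_backends(void) {
    enum ggml_task_backend backends[16];
    int n = ggml_mulmat_tune_get_builtin_task_backends(backends);
    for (int i = 0; i < n; i++) {
        fprintf(stdout, "%s%s", i > 0 ? ", " : "",
                ggml_mulmat_tune_task_backend_name(backends[i]));
    }
    fprintf(stdout, "\n");
}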
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params) {
GGML_ASSERT(tune);
GGML_ASSERT(params);
GGML_ASSERT(params->model.name);
enum ggml_task_backend backends[16];
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
if (n_backends < 2) {
fprintf(stderr,
"[mulmat tune] error: this program was not built with a BLAS or GPU backend.\n");
return false;
}
bool ok = ggml_mulmat_tune_init(tune, params, NULL);
if (!ok) {
return false;
}
{
char buf[128] = {0};
int offset = 0;
for (int i = 0; i < n_backends; i++) {
if (i > 0) {
buf[offset++] = ',';
buf[offset++] = ' ';
}
const char *name = ggml_mulmat_tune_task_backend_name(backends[i]);
size_t len = strlen(name);
memcpy(&buf[offset], name, len);
offset += (int)len;
}
fprintf(stdout,
"[mulmat tune] model: %s, ggml ftype: %d, "
"n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n",
params->model.name, params->model.ftype, params->n_pass,
params->n_threads, tune->n_shapes, buf);
}
int64_t stages_time[3];
int64_t t0 = ggml_time_ms();
struct ggml_threading_context *thrd_ctx = ggml_threading_start(
tune->n_threads, ggml_threading_graph_compute_thread,
ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE,
stages_time);
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
int M;
int N = shape->N;
int K = shape->K;
char buf[20] = {0};
int buf_len = sizeof(buf) - 1;
int line_len = 0;
for (int i_m = 0; i_m < shape->m_num; i_m++) {
M = shape->arr_m[i_m];
if (shape->N == 0) {
N = M;
} else if (shape->K == 0) {
K = M;
}
if (params->progress) {
line_len = snprintf(buf, buf_len, "%d %d %d ", N, K, M);
fprintf(stdout, "%s", buf);
fflush(stdout);
}
char *wdata = NULL;
size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata);
if (wsize == 0) {
return false;
}
struct ggml_context *ctx = NULL;
struct ggml_tensor *node =
ggml_mulmat_new_tensor(M, N, K, shape->src0_type, &ctx);
for (int ip = 0; ip < shape->n_profiles; ip++) {
const struct ggml_task_profile *profile = &shape->profiles[ip];
memcpy(&node->task_profile, profile,
sizeof(struct ggml_task_profile));
struct ggml_mulmat_tune_m *item =
&shape->items[ip * shape->m_num + i_m];
item->M = M;
int min[3] = {INT32_MAX, INT32_MAX, INT32_MAX};
for (int k = 0; k < params->n_pass; k++) {
for (int j = 0; j < 3; j++) {
stages_time[j] = 0;
}
/*enum ggml_compute_error err = */
ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
for (int i = 0; i < 3; i++) {
int v = (int)stages_time[i];
if (v < min[i]) {
min[i] = v;
}
}
if (params->progress) {
fprintf(stdout, ".");
fflush(stdout);
line_len++;
}
}
for (int i = 0; i < 3; i++) {
item->stages_time[i] = min[i];
}
}
ggml_free(ctx);
free(wdata);
if (params->progress) {
line_len += 10;
for (int j = 0; j < line_len; j++) {
fprintf(stdout, "\b \b");
}
fflush(stdout);
}
}
}
ggml_threading_stop(thrd_ctx);
fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n",
(int)(ggml_time_ms() - t0) / 1000);
// output
if (params->fname && strcmp(params->fname, "") != 0) {
FILE *fp = fopen(params->fname, "w");
if (!fp) {
fprintf(stderr,
"[mulmat tune] warn: failed to open file `%s`, print to "
"console instead\n\n",
params->fname);
params->output_console = 1;
} else {
ok = ggml_mulmat_tune_write_data(tune, fp);
fclose(fp);
if (ok) {
fprintf(stdout, "[mulmat tune] data was written to `%s`\n",
params->fname);
} else {
fprintf(
stderr,
"[mulmat tune] warn: failed to write file `%s`, print to "
"console instead\n\n",
params->fname);
params->output_console = 1;
}
}
}
if (params->output_console) {
return ggml_mulmat_tune_write_data(tune, stdout);
}
return true;
}
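// Illustrative sketch (not part of this commit): a self-contained bench call
// with made-up "7B"-style hyperparameters; real callers fill these from the
// loaded model (see llama_mulmat_tune() in llama.cpp).
static bool ggml_mulmat_tune_example_bench(void) {
    struct ggml_mulmat_tune tune;
    struct ggml_mulmat_tune_params params = {
        .model = {.name = "7B", .ftype = GGML_FTYPE_MOSTLY_Q4_0,
                  .n_vocab = 32000, .n_embd = 4096, .n_ff = 11008, .n_rot = 128},
        .m_num = 8,
        .n_pass = 1,
        .n_threads = 4,
        .progress = true,
        .output_console = false,
        .fname = "mulmat-tune.txt",
    };
    bool ok = ggml_mulmat_tune_bench(&tune, &params);
    ggml_mulmat_tune_free(&tune);
    return ok;
}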

137
ggml-tune.h Normal file
View File

@ -0,0 +1,137 @@
#pragma once
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_MULMAT_TUNE_VERSION 8
#define GGML_MULMAT_N_SHAPES 6
#define GGML_MULMAT_MAX_PASS 3
struct ggml_mulmat_tune_m {
int M;
int stages_time[3];
};
struct ggml_mulmat_tune_model {
const char *name;
enum ggml_ftype ftype;
int n_vocab;
int n_embd;
// n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult
int n_ff;
// n_rot = n_embd/n_head;
int n_rot;
};
struct ggml_mulmat_tune_shape {
// For RoPE, one of N / K is 0.
int N;
int K;
enum ggml_type src0_type;
enum ggml_type src1_type;
int n_profiles;
struct ggml_task_profile *profiles;
int m_num;
int *arr_m;
struct ggml_mulmat_tune_m *items;
};
struct ggml_mulmat_tune {
int version;
char model[16];
enum ggml_ftype ftype;
int n_shapes;
// Given N/K, we bench for mul_mat [M,K] x [K,N].
struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES];
int n_threads;
};
struct ggml_mulmat_tune_time {
struct ggml_task_profile *profile;
int stage_time[3];
int total_time;
};
struct mm_cache_element {
int M;
int N;
int K;
struct ggml_task_profile *profile;
int stages_time[3];
};
// params for tune/bench.
struct ggml_mulmat_tune_params {
struct ggml_mulmat_tune_model model;
int m_num;
int n_pass;
int n_threads;
bool progress; // print and clear '.'
bool output_console; // also print result to console
const char *fname;
};
// NOTE: stages_time is filled if not null.
const struct ggml_task_profile *
ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
int N, int K, enum ggml_type src0_t,
enum ggml_type src1_t, int stages_time[3]);
bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
const char *model_name, int ftype,
int n_threads);
void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
const char *name, enum ggml_ftype ftype);
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params,
struct ggml_task_profile_factory *profile_factory);
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);
const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K,
enum ggml_type src0_type, enum ggml_type src1_type);
void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape,
int M,
struct ggml_mulmat_tune_time *profile_time);
const char *ggml_task_backend_name(enum ggml_task_backend backend);
int ggml_mulmat_tune_get_builtin_task_backends(
enum ggml_task_backend *backends);
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params);
#ifdef __cplusplus
}
#endif
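// Illustrative usage sketch (not part of this header), assuming a data file
// previously produced by ggml_mulmat_tune_bench(); the file name, model name
// and shape numbers are made up:
//
//   struct ggml_mulmat_tune tune;
//   FILE *fp = fopen("mulmat-tune.txt", "r");
//   if (fp) {
//       bool ok = ggml_mulmat_tune_read_data(&tune, fp);
//       fclose(fp);
//       if (ok && ggml_mulmat_tune_validate(&tune, "7B", tune.ftype, /*n_threads*/ 4)) {
//           int stages_time[3];
//           const struct ggml_task_profile *profile =
//               ggml_mulmat_tune_select_task_profile(
//                   &tune, /*M*/ 32, /*N*/ 4096, /*K*/ 4096,
//                   GGML_TYPE_Q4_0, GGML_TYPE_F32, stages_time);
//           (void)profile; // how to run INIT/COMPUTE/FINALIZE for this shape
//       }
//       if (ok) {
//           ggml_mulmat_tune_free(&tune);
//       }
//   }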

1032
ggml.c

File diff suppressed because it is too large

71
ggml.h
View File

@ -1,5 +1,7 @@
#pragma once
//
// GGML Tensor Library
//
@ -200,6 +202,7 @@
#define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32
#define GGML_DEFAULT_N_THREADS 4
#define GGML_MAX_TASK_PROFILES 8
#define GGML_ASSERT(x) \
do { \
@ -347,7 +350,6 @@ extern "C" {
GGML_OP_COUNT,
};
// ggml object
struct ggml_object {
size_t offs;
@ -360,6 +362,54 @@ extern "C" {
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// As part of task config profile solution, `ggml_task_backend` defines
// backends for each task stage. Similar to `ggml_tensor.backend`,
// `ggml_tensor.task_profile` generalizes how to configure tensor computing
// at per task-stage level.
//
// The following enum values are designed as combination of hardware and
// optional software interface.
enum ggml_task_backend {
GGML_TASK_BACKEND_NONE = 0,
// [0x10, 0x1F]: CPU
GGML_TASK_BACKEND_CPU = 0x10,
GGML_TASK_BACKEND_CPU_BLAS = 0x11,
// [0x20 - 0x2F]: GPU
GGML_TASK_BACKEND_GPU = 0x20,
GGML_TASK_BACKEND_GPU_CUDA = 0x21,
GGML_TASK_BACKEND_GPU_CL = 0x22,
};
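// Because the values are grouped by range, the hardware class of a backend
// can be tested with a mask (illustrative, not an API guarantee):
//   bool is_cpu = (backend & 0xF0) == GGML_TASK_BACKEND_CPU;
//   bool is_gpu = (backend & 0xF0) == GGML_TASK_BACKEND_GPU;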
// config for computing one of the 3 task stages of a tensor.
struct ggml_task_stage {
enum ggml_task_backend backend;
bool parallel;
// hint idle workers go waiting, valid only when parallel is false.
bool wait;
};
// config for computing a tensor.
struct ggml_task_profile {
// index 0: INIT, 1: COMPUTE, 2: FINALIZE
struct ggml_task_stage stages[3];
// MUST be used only in test code.
uint8_t dev_flags[4];
};
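// Illustrative sketch (not part of this header): a profile whose INIT stage
// runs single-threaded on the CPU and whose COMPUTE stage runs via BLAS, with
// idle workers asked to wait instead of spinning:
//
//   struct ggml_task_profile p = {0};
//   p.stages[0].backend = GGML_TASK_BACKEND_CPU;      // INIT
//   p.stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; // COMPUTE
//   p.stages[1].wait    = true;                       // valid: parallel is false
//   p.stages[2].backend = GGML_TASK_BACKEND_NONE;     // no FINALIZE
//   node->task_profile  = p;                          // node: struct ggml_tensor *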
struct ggml_task_profile_factory {
struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES];
int n_f32_f32;
struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES];
int n_f16_f32;
struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES];
int n_qxx_f32;
};
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@ -383,7 +433,8 @@ extern "C" {
struct ggml_tensor * opt[GGML_MAX_OPT];
// thread scheduling
int n_tasks;
struct ggml_task_profile task_profile;
// performance
int perf_runs;
@ -396,7 +447,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[4]; char padding[12];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -407,6 +458,8 @@ extern "C" {
int n_leafs;
int n_threads;
struct ggml_mulmat_tune *tune;
size_t work_size;
struct ggml_tensor * work;
@ -1287,9 +1340,21 @@ extern "C" {
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_cpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_vsx (void);
//
// mulmat task profiles
//
GGML_API void ggml_mulmat_init_task_profiles(void);
GGML_API int ggml_mulmat_get_task_profiles(
struct ggml_task_profile_factory *pf,
enum ggml_type src0_t,
enum ggml_type src1_t,
struct ggml_task_profile **profiles);
//
// Internal types and functions exposed for tests and benchmarks
//

160
llama.cpp
View File

@ -4,6 +4,7 @@
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#endif
#include "llama-util.h"
@ -20,6 +21,10 @@
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_MULMAT_TUNE
#include "ggml-tune.h"
#endif
#include <array>
#include <ctime>
#include <cinttypes>
@ -280,6 +285,10 @@ struct llama_context {
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
#ifdef GGML_USE_MULMAT_TUNE
struct ggml_mulmat_tune *tune = nullptr;
#endif
void use_buf(struct ggml_context * ctx, int i) {
#if defined(LLAMA_USE_SCRATCH)
size_t last_size = 0;
@ -1396,10 +1405,12 @@ static bool llama_eval_internal(
struct ggml_context * ctx0 = ggml_init(params);
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; gf.n_threads = n_threads;
#ifdef GGML_USE_MULMAT_TUNE
gf.tune = lctx.tune;
#endif
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
@ -2732,7 +2743,150 @@ struct llama_context * llama_init_from_file(
return ctx;
}
#ifdef GGML_USE_MULMAT_TUNE
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
printf("\n");
if (ctx->model.n_gpu_layers != 0) {
fprintf(stderr, "[mulmat tune] error: tuning is disabled when GPU offloading is enabled\n");
return false;
}
const char *model_name = llama_model_type_name(ctx->model.type);
llama_hparams *hparams = &ctx->model.hparams;
enum ggml_ftype ggml_ftype;
switch (hparams->ftype) {
case LLAMA_FTYPE_ALL_F32:
ggml_ftype = GGML_FTYPE_ALL_F32;
break;
case LLAMA_FTYPE_MOSTLY_F16:
ggml_ftype = GGML_FTYPE_MOSTLY_F16;
break;
case LLAMA_FTYPE_MOSTLY_Q4_0:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_0;
break;
case LLAMA_FTYPE_MOSTLY_Q4_1:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
break;
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
break;
case LLAMA_FTYPE_MOSTLY_Q5_0:
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
break;
case LLAMA_FTYPE_MOSTLY_Q5_1:
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_1;
break;
case LLAMA_FTYPE_MOSTLY_Q8_0:
ggml_ftype = GGML_FTYPE_MOSTLY_Q8_0;
break;
case LLAMA_FTYPE_MOSTLY_Q2_K:
ggml_ftype = GGML_FTYPE_MOSTLY_Q2_K;
break;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
ggml_ftype = GGML_FTYPE_MOSTLY_Q3_K;
break;
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_K;
break;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_K;
break;
case LLAMA_FTYPE_MOSTLY_Q6_K:
ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
break;
default:
throw std::runtime_error(
format("invalid output file type %d\n", hparams->ftype));
}
int n_vocab = hparams->n_vocab;
int n_embd = hparams->n_embd;
int n_rot = hparams->n_rot;
int n_mult = hparams->n_mult;
int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
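// e.g. with 7B-style hparams (n_embd = 4096, n_mult = 256):
// 2*(4*4096)/3 = 10922, ((10922 + 255)/256)*256 = 43*256 = n_ff = 11008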
struct ggml_mulmat_tune_params params = {
/*.model =*/ {
/* .name =*/ model_name,
/* .ftype =*/ ggml_ftype,
/* .n_vocab =*/ n_vocab,
/* .n_embd =*/ n_embd,
/* .n_ff =*/ n_ff,
/* .n_rot =*/ n_rot,
},
/* .m_num =*/ 8,
/* .n_pass =*/ 1,
/* .n_threads =*/ n_threads,
/* .progress =*/ true,
/* .output_console =*/ false,
/* .fname =*/ fname,
};
bool empty_fname = !fname || strcmp(fname, "") == 0;
ctx->tune = new(struct ggml_mulmat_tune);
if (!ctx->tune) {
throw std::runtime_error(format("failed to allocate memory for tune\n"));
}
if (tune) {
bool ok = ggml_mulmat_tune_bench(ctx->tune, &params);
if (!ok) {
ggml_mulmat_tune_free(ctx->tune);
return false;
}
if (!empty_fname) {
ggml_mulmat_tune_free(ctx->tune);
return true;
}
} else {
if (empty_fname) {
return false;
}
}
if (!empty_fname) {
FILE *fp = fopen(fname, "r");
if (!fp) {
fprintf(stderr, "[mulmat tune] failed to open file %s.\n",
fname);
} else {
bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
fclose(fp);
if (!ok) {
fprintf(stderr,
"[mulmat tune] failed to read data from %s\n",
fname);
return false;
}
fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname);
ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads);
if (!ok) {
return false;
}
}
}
return true;
}
#endif
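// Illustrative sketch (not part of this file): how an application might drive
// the tuner. With tune == true the bench runs and (when fname is given) the
// result is written to that file; with tune == false a previously saved file
// is loaded instead. The file name and error handling here are hypothetical.
//
// #ifdef GGML_USE_MULMAT_TUNE
//     if (!llama_mulmat_tune(ctx, /*n_threads*/ 4, /*tune*/ true, "mulmat-tune.txt")) {
//         fprintf(stderr, "mulmat tune failed\n");
//     }
// #endif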
void llama_free(struct llama_context * ctx) {
#ifdef GGML_USE_MULMAT_TUNE
if (ctx->tune) {
delete(ctx->tune);
}
#endif
delete ctx;
}

llama.h
View File

@ -300,6 +300,9 @@ extern "C" {
// Print system information
LLAMA_API const char * llama_print_system_info(void);
// Experimental utility for mulmat tuning.
LLAMA_API bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname);
#ifdef __cplusplus
}
#endif

2
tests/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/test-ggml-threading
/test-ggml-tune

tests/CMakeLists.txt
View File

@ -12,3 +12,5 @@ llama_add_test(test-sampling.cpp)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
# llama_add_test(test-grad0.c) # SLOW
# llama_add_test(test-opt.c) # SLOW
llama_add_test(test-ggml-threading.c)
llama_add_test(test-ggml-tune.c)

345
tests/test-ggml-threading.c Normal file
View File

@ -0,0 +1,345 @@
#include "ggml-threading.h"
#include "ggml.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Purposes:
// 1. general overview of the threading behaviors.
// 2. race / deadlock detection.
// # build
// cd build
//
// # build release:
// cmake .. && cmake --build . --config Release
//
// # build with sanitize:
// cmake .. -DLLAMA_SANITIZE_THREAD=ON && cmake --build . --config Release
//
// # run:
// ./bin/test-ggml-threading
// How to turn off the warning on Apple: malloc: nano zone abandoned due to
// inability to reserve vm space?
// ==> export MallocNanoZone=0, no need to rebuild.
// See `nano_init()` from
// https://opensource.apple.com/source/libmalloc/libmalloc-140.40.1/src/nano_malloc.c.auto.html
// How to view the threading debug:
// ==> uncomment `#define GGML_THREADING_DEBUG 1` from file ggml-threading.c
#define UNUSED(x) (void)(x)
#define MAX_N_THREADS 16
static const int n_repeat = 10;
// C11 atomics are awkward to use on Windows, so replace the atomic counter
// with a per-thread array.
static int work_done_arr[MAX_N_THREADS];
static enum ggml_compute_error
mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) {
int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
if (node->task_profile.stages[params->type].parallel) {
loops /= params->nth;
}
volatile int64_t j = 0;
for (int i = 0; i < loops; i++) {
j++;
}
UNUSED(j);
work_done_arr[params->ith]++;
return GGML_COMPUTE_OK;
}
int test_driver(int id, struct ggml_tensor *node, int n_threads) {
printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads);
for (int i = 0; i < n_threads; i++) {
work_done_arr[i] = 0;
}
bool wait_on_done = (node->task_profile.dev_flags[0] > 0u);
enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF;
if (wait_on_done) {
features |= GGML_THREADING_FEATURE_WAIT_ON_DONE;
}
int t0 = (int)ggml_time_us();
struct ggml_threading_context *ctx =
ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
mock_task_runner, features, /*stages_time*/ NULL);
int t1 = (int)ggml_time_us();
for (int i = 0; i < n_repeat; i++) {
enum ggml_compute_error err = ggml_threading_compute_tensor(
ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
if (err != GGML_COMPUTE_OK) {
ggml_threading_stop(ctx);
fprintf(stderr,
"ggml_threading_compute_tensor failed with error: %d.\n",
err);
return 1;
}
}
int t2 = (int)ggml_time_us();
ggml_threading_stop(ctx);
int t3 = (int)ggml_time_us();
int expect = 0;
for (int i = 0; i < 3; i++) {
struct ggml_task_stage *ts = &node->task_profile.stages[i];
if (ts->backend != GGML_TASK_BACKEND_NONE) {
if (ts->parallel) {
expect += n_threads;
} else {
expect++;
}
}
}
expect *= n_repeat;
int actual = 0;
for (int i = 0; i < n_threads; i++) {
actual += work_done_arr[i];
}
uint8_t loops = node->task_profile.dev_flags[1];
printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: "
"(parallel: %d, "
"wait: %d)\n"
"\tstage-1: (parallel: %d, wait: %d)\n",
loops, wait_on_done, node->task_profile.stages[0].parallel,
node->task_profile.stages[0].wait,
node->task_profile.stages[1].parallel,
node->task_profile.stages[1].wait);
if (actual == expect) {
printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f "
"ms, total %6.3f ms\n",
1.0 * (t1 - t0) / 1000, 1.0 * (t2 - t1) / 1000,
1.0 * (t3 - t2) / 1000, 1.0 * (t3 - t0) / 1000);
return 0;
}
fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect,
actual);
return 2;
}
static enum ggml_compute_error
mock_task_runner_fallback(struct ggml_compute_params *params,
struct ggml_tensor *node) {
UNUSED(params);
if (node->backend == GGML_BACKEND_GPU) {
// ... finally failed to compute in GPU.
node->backend = GGML_BACKEND_CPU;
return GGML_COMPUTE_FALLBACK;
} else {
return GGML_COMPUTE_OK;
}
}
// By design, fallback happens when attempting to compute the tensor on the
// GPU, so it is not parallelized.
int test_fallback(struct ggml_tensor *node) {
struct ggml_threading_context *ctx = ggml_threading_start(
1, ggml_threading_graph_compute_thread, mock_task_runner_fallback,
/*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL);
enum ggml_compute_error err =
ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
if (err == GGML_COMPUTE_FALLBACK) {
err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
/*wsize*/ 0);
}
ggml_threading_stop(ctx);
if (err != GGML_COMPUTE_OK) {
fprintf(stderr,
"ggml_threading_compute_tensor failed with error: %d.\n", err);
return 1;
}
return 0;
}
int main(void) {
ggml_time_init();
struct ggml_tensor node;
memset(&node, 0, sizeof(struct ggml_tensor));
struct ggml_task_stage *stages = node.task_profile.stages;
stages[0].backend = GGML_TASK_BACKEND_CPU;
stages[1].backend = GGML_TASK_BACKEND_CPU;
stages[2].backend = GGML_TASK_BACKEND_NONE;
int n_passed = 0;
int n_tests = 0;
int parallel[3] = {0, 1, 2};
// In github build actions (windows-latest-cmake and ubuntu-latest-cmake):
// When n_threads >= 4, the thread init time and compute time suddenly become
// 100x ~ 1000x slower than with n_threads == 2.
//
// But the tests (n_threads 1, 2, 4, 6) look sound on my devices:
// - MacBook air 2013, ubuntu 22.04
// - MacBook pro 2018, macOS 13.4
//
// So I assume the github build host has limited multi-cpu quota.
// Will skip computing when threading init time is too slow.
//
// NOTE: it's observed that when workload is 0 and n_threads >= number of
// physical cores:
// - the wait/wakeup time varies a lot: it can be tens or hundreds of times
//   the average, which greatly punishes small workloads.
// - wait_on_done is generally faster than wait_now, sometimes by 10x.
int threads_arr[] = {1, 2, 4, 8};
int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
// millions of loops.
uint8_t workload_arr[] = {0u, 1u, 10u};
int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]);
// node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops.
for (int x = 0; x < workload_arr_len; x++) {
node.task_profile.dev_flags[1] = workload_arr[x];
for (int i = 0; i < threads_arr_len; i++) {
int n_threads = threads_arr[i];
if (n_threads > MAX_N_THREADS) {
abort();
}
printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, "
"loops: %2d million(s) ====\n",
n_repeat, n_threads, workload_arr[x]);
if (n_threads > 1) { // skip this n_threads when too slow.
int t0 = (int)ggml_time_us();
struct ggml_threading_context *ctx = ggml_threading_start(
n_threads, ggml_threading_graph_compute_thread,
mock_task_runner, 0, /*stages_time*/ NULL);
int t1 = (int)ggml_time_us();
ggml_threading_stop(ctx);
int elapsed_us = t1 - t0;
if (elapsed_us > 500 * n_threads) {
fprintf(stderr,
"[test-ggml-threading] warning: it took %.3f "
"ms to start %d worker thread(s).\n",
1.0 * elapsed_us / 1000, n_threads - 1);
fprintf(stderr, "[test-ggml-threading] warning: looks like "
"the environment is too slow to run this "
"number of threads, skip.\n");
continue;
}
}
// multi-threads: parallel + wait_now/wait_on_done
if (n_threads == 1) {
stages[0].parallel = false;
stages[1].parallel = false;
stages[0].wait = false;
stages[1].wait = false;
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
continue;
}
for (int j = 0; j < 3; j++) {
stages[0].wait = false;
stages[1].wait = false;
node.task_profile.dev_flags[0] = 0u;
if (parallel[j] == 0) {
stages[0].parallel = false;
stages[1].parallel = false;
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
} else if (parallel[j] == 1) {
stages[0].parallel = true;
stages[1].parallel = false;
for (int k = 0; k < 2; k++) {
stages[1].wait = (k == 1);
if (!stages[1].wait) {
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
continue;
}
// wait
for (int m = 0; m < 2; m++) {
if (m == 1) {
node.task_profile.dev_flags[0] = 1u;
}
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
node.task_profile.dev_flags[0] = 0u;
}
}
} else {
stages[0].parallel = true;
stages[1].parallel = true;
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
}
}
}
}
{
++n_tests;
node.backend = GGML_BACKEND_GPU;
if (test_fallback(&node) == 0) {
++n_passed;
printf("\n[test-ggml-threading] test fallback: ok\n\n");
}
}
printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests);
return (n_passed == n_tests) ? 0 : 1;
}

200
tests/test-ggml-tune.c Normal file
View File

@ -0,0 +1,200 @@
#include "ggml-tune.h"
#include "ggml.h"
#include <string.h>
static int bench(void);
static int estimate_time_non_zero_NK(void);
static void init_params(struct ggml_mulmat_tune_params *params, int m_num) {
*params = (struct ggml_mulmat_tune_params){
.model =
(struct ggml_mulmat_tune_model){
.name = "3B", // fake
.ftype = GGML_FTYPE_MOSTLY_Q4_0,
.n_vocab = 4096,
.n_embd = 1024,
.n_ff = 2048,
.n_rot = 128,
},
.m_num = m_num,
.n_pass = 1,
.n_threads = 1,
.progress = false,
.output_console = true,
.fname = NULL};
}
int main(void) {
int rv = bench();
if (rv != 0) {
return rv;
}
printf("\n");
rv = estimate_time_non_zero_NK();
if (rv != 0) {
return rv;
}
printf("\n");
return 0;
}
static int bench(void) {
printf("test: %s\n", __func__);
{
enum ggml_task_backend backends[16];
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
if (n_backends < 2) {
printf("test: %s, skipped because no BLAS\n", __func__);
return 0;
}
}
{
struct ggml_init_params init_params = {
/*.mem_size =*/1,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/0,
};
struct ggml_context *ctx = ggml_init(init_params);
GGML_ASSERT(ctx);
ggml_free(ctx);
}
struct ggml_mulmat_tune tune;
struct ggml_mulmat_tune_params params;
init_params(&params, /*m_num*/ 4);
bool ok = ggml_mulmat_tune_bench(&tune, &params);
ggml_mulmat_tune_free(&tune);
return ok ? 0 : 1;
}
static int estimate_time_non_zero_NK(void) {
printf("test: %s\n", __func__);
struct test_data_t {
int M;
int time[3]; // up to 3 profiles; this test uses 2.
};
struct ggml_mulmat_tune tune = {
.version = 1,
.ftype = GGML_FTYPE_MOSTLY_Q4_0,
};
const int m_num = 2;
struct ggml_task_profile_factory pf;
memset(&pf, 0, sizeof(struct ggml_task_profile_factory));
{
pf.n_qxx_f32 = 2;
pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
}
struct ggml_mulmat_tune_params params;
init_params(&params, m_num);
ggml_mulmat_tune_init(&tune, &params, &pf);
struct ggml_mulmat_tune_shape *shape = NULL;
for (int i = 0; i < tune.n_shapes; i++) {
if (tune.shapes[i].N > 0 && tune.shapes[i].K > 0) {
shape = &tune.shapes[i];
break;
}
}
GGML_ASSERT(shape);
GGML_ASSERT(shape->n_profiles == 2);
GGML_ASSERT(ggml_is_quantized(shape->src0_type));
printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K,
shape->n_profiles);
{
shape->items[0] =
(struct ggml_mulmat_tune_m){.M = 2, .stages_time = {2, 4, 0}};
shape->items[1] =
(struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 8, 0}};
shape->items[2] =
(struct ggml_mulmat_tune_m){.M = 2, .stages_time = {4, 4, 0}};
shape->items[3] =
(struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 4, 0}};
}
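// How the expected times below are derived (linear interpolation in M):
// profile 0 (pure CPU): t(2) = 2+4 = 6 and t(4) = 4+8 = 12, so t(M) = 3*M,
//   giving 3, 6, 9, 12, 15 for M = 1..5 (M = 1 and 5 are extrapolated).
// profile 1 (COMPUTE via BLAS): t(2) = t(4) = 4+4 = 8, so the estimate is a
//   constant 8 for every M.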
const struct test_data_t test_data[] = {
{
.M = 1, // out of range
.time = {3, 8},
},
{
.M = 2,
.time = {6, 8},
},
{
.M = 3,
.time = {9, 8},
},
{
.M = 4,
.time = {12, 8},
},
{
.M = 5, // out of range
.time = {15, 8},
},
};
int n_tests = (int)(sizeof(test_data) / sizeof(struct test_data_t));
struct ggml_mulmat_tune_time profile_time[GGML_MAX_TASK_PROFILES];
size_t profile_time_sz =
sizeof(struct ggml_mulmat_tune_time) * GGML_MAX_TASK_PROFILES;
int n_passed = 0;
for (int i = 0; i < n_tests; i++) {
memset(profile_time, 0, profile_time_sz);
const struct test_data_t *e = &test_data[i];
const struct ggml_mulmat_tune_shape *matched_shape =
ggml_mulmat_tune_get_shape(&tune, shape->N, shape->K,
shape->src0_type, shape->src1_type);
GGML_ASSERT(matched_shape);
GGML_ASSERT(matched_shape == shape);
ggml_mulmat_tune_estimate_time(matched_shape, e->M, profile_time);
for (int j = 0; j < shape->n_profiles; j++) {
int actual = profile_time[j].total_time;
int expect = e->time[j];
if (expect != actual) {
fprintf(stderr,
"test fail. i: %d, j: %d, M: %d, expect: "
"%d, actual: %d\n",
i, j, e->M, expect, actual);
} else {
++n_passed;
}
}
}
n_tests *= shape->n_profiles;
printf("%2d of %2d pass\n", n_passed, n_tests);
ggml_mulmat_tune_free(&tune);
return n_passed == n_tests ? 0 : 1;
}