From 213f133701e947a94bdd070315cc0528429bd27d Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 14 Jun 2023 18:33:14 +0800 Subject: [PATCH] initial --- .gitignore | 1 + CMakeLists.txt | 9 + Makefile | 31 +- examples/CMakeLists.txt | 1 + examples/common.cpp | 14 + examples/common.h | 2 + examples/main/main.cpp | 10 + examples/mulmat-tune/CMakeLists.txt | 14 + examples/mulmat-tune/README.md | 272 +++++++ examples/mulmat-tune/mulmat-tune.cpp | 277 +++++++ examples/perplexity/perplexity.cpp | 10 + ggml-cuda.cu | 2 +- ggml-opencl.cpp | 2 +- ggml-threading.c | 620 ++++++++++++++++ ggml-threading.h | 68 ++ ggml-tune.c | 897 ++++++++++++++++++++++ ggml-tune.h | 137 ++++ ggml.c | 1032 +++++++++++++------------- ggml.h | 71 +- llama.cpp | 160 +++- llama.h | 3 + tests/.gitignore | 2 + tests/CMakeLists.txt | 2 + tests/test-ggml-threading.c | 345 +++++++++ tests/test-ggml-tune.c | 200 +++++ 25 files changed, 3664 insertions(+), 518 deletions(-) create mode 100644 examples/mulmat-tune/CMakeLists.txt create mode 100644 examples/mulmat-tune/README.md create mode 100644 examples/mulmat-tune/mulmat-tune.cpp create mode 100644 ggml-threading.c create mode 100644 ggml-threading.h create mode 100644 ggml-tune.c create mode 100644 ggml-tune.h create mode 100644 tests/.gitignore create mode 100644 tests/test-ggml-threading.c create mode 100644 tests/test-ggml-tune.c diff --git a/.gitignore b/.gitignore index e7bfd52e3..c2e2a0ab0 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ models/* /server /Pipfile /libllama.so +/mulmat-tune build-info.h arm_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 736771954..832c1e986 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,6 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" OFF) +option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF) # # Build info header @@ -214,6 +215,7 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) + add_compile_definitions(GGML_BLAS_VENDOR="${LLAMA_BLAS_VENDOR}") set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) @@ -276,6 +278,11 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_MULMAT_TUNE) + add_compile_definitions(GGML_USE_MULMAT_TUNE) + add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG) +endif() + if (LLAMA_K_QUANTS) set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) add_compile_definitions(GGML_USE_K_QUANTS) @@ -450,6 +457,8 @@ endif() add_library(ggml OBJECT ggml.c + ggml-threading.c + ggml-tune.c ggml.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} diff --git a/Makefile b/Makefile index afd06e0a6..a8d1bdc09 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple mulmat-tune ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -47,7 +47,8 @@ endif OPT = -O3 CFLAGS = -I. $(OPT) -std=c11 -fPIC CXXFLAGS = -I. 
-I./examples $(OPT) -std=c++11 -fPIC -LDFLAGS = +# -lm fixed error: ggml.o: undefined reference to symbol 'tanhf@@GLIBC_2.2.5' from ubuntu 22.04 +LDFLAGS = -lm ifdef LLAMA_DEBUG CFLAGS += -O0 -g @@ -134,8 +135,7 @@ ifndef LLAMA_NO_K_QUANTS endif ifndef LLAMA_NO_ACCELERATE - # Mac M1 - include Accelerate framework. - # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). + # Mac Intel & M1 - include Accelerate framework. ifeq ($(UNAME_S),Darwin) CFLAGS += -DGGML_USE_ACCELERATE LDFLAGS += -framework Accelerate @@ -145,10 +145,16 @@ endif # LLAMA_NO_ACCELERATE ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas LDFLAGS += -lopenblas + ifeq ($(UNAME_S),Darwin) + # openblas installed with Homebew on macOS. + CFLAGS += -I/usr/local/opt/openblas/include + LDFLAGS += -L/usr/local/opt/openblas/lib + endif endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + CFLAGS += -DGGML_BLAS_VENDOR="\"BLIS\"" LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS @@ -230,6 +236,11 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS +ifdef LLAMA_MULMAT_TUNE + CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG + CXXFLAGS += -DGGML_USE_MULMAT_TUNE +endif + # # Print build information # @@ -245,6 +256,8 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) +OBJS += ggml-tune.o ggml-threading.o + # # Build library # @@ -253,7 +266,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h - $(CXX) $(CXXFLAGS) -c $< -o $@ + +ggml-threading.o: ggml-threading.c ggml.h + $(CC) $(CFLAGS) -c $< -o $@ + +ggml-tune.o: ggml-tune.c ggml.h + $(CC) $(CFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -298,6 +316,9 @@ server: examples/server/server.cpp examples/server/httplib.h examples/server/jso train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +mulmat-tune: examples/mulmat-tune/mulmat-tune.cpp build-info.h ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o mulmat-tune $(LDFLAGS) + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index cf9c4a223..cf01b8a2a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,6 +39,7 @@ else() add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) add_subdirectory(simple) + add_subdirectory(mulmat-tune) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/common.cpp b/examples/common.cpp index fed24e027..882e90c9c 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -345,6 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; +#ifdef GGML_USE_MULMAT_TUNE + } else if (arg == "--tune") { + params.tune = true; + } else if (arg == "--tune-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.tune_file = argv[i]; +#endif // GGML_USE_MULMAT_TUNE } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -498,6 +508,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); +#ifdef GGML_USE_MULMAT_TUNE + fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n"); + fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n"); +#endif fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); diff --git a/examples/common.h b/examples/common.h index 6c2953cb2..5e394b218 100644 --- a/examples/common.h +++ b/examples/common.h @@ -77,6 +77,8 @@ struct gpt_params { bool mem_test = false; // compute maximum memory usage bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation + bool tune = false; // mulmat tune: enable + std::string tune_file = ""; // mulmat tune: data file }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 941312f9c..542e463bf 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,6 +117,16 @@ int main(int argc, char ** argv) { return 1; } +#ifdef GGML_USE_MULMAT_TUNE + if (params.tune || !params.tune_file.empty()) { + bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); + if (!ok || (params.tune && !params.tune_file.empty())) { + llama_free(ctx); + return ok? 
0: 1;
+        }
+    }
+#endif
+
     // print system information
     {
         fprintf(stderr, "\n");
diff --git a/examples/mulmat-tune/CMakeLists.txt b/examples/mulmat-tune/CMakeLists.txt
new file mode 100644
index 000000000..51e1053e8
--- /dev/null
+++ b/examples/mulmat-tune/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(TARGET mulmat-tune)
+add_executable(${TARGET} mulmat-tune.cpp)
+
+if (XCODE OR MSVC)
+    set(MULMAT_TUNE_LIBS ggml)
+else()
+    set(MULMAT_TUNE_LIBS ggml m)
+endif()
+
+target_link_libraries(${TARGET} PRIVATE ${MULMAT_TUNE_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md
new file mode 100644
index 000000000..cff8a3d64
--- /dev/null
+++ b/examples/mulmat-tune/README.md
@@ -0,0 +1,272 @@
+# Mulmat Benchmark and Tuning
+
+Apart from the standalone tool `mulmat-tune`, mulmat tune is also integrated into
+`main` and `perplexity`. To avoid adding too many new cli options, I added just two.
+To make the integrated bench run faster, `m_num` is set to 8 (thus the max M is 128)
+and `n_pass` is set to 1.
+
+With the newly added cli options, `main` and `perplexity` can be used in the
+following three ways:
+
+* bench and run: --tune
+* bench and exit: --tune --tune-file
+* load and run: --tune-file
+
+The `load` mode reads an existing data file. This is fine because we can run the
+bench ahead of time (saving tens of seconds), but it has two shortcomings:
+- the bench has to be re-run whenever the file format changes; this is OK because
+  the loader detects the mismatch and tells us to re-run.
+- the more subtle problem is that the algorithm may change silently while we keep
+  using an outdated data file. So I integrated mulmat tune into `main` and
+  `perplexity` as a complementary solution.
+
+## Build into main and perplexity
+
+Makefile:
+```
+make clean && LLAMA_MULMAT_TUNE=1 make
+```
+
+CMake (with BLAS):
+```
+cmake --build . --target clean
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake --build . --config Release
+```
+
+Run examples:
+
+```
+# bench and run:
+
+./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune
+
+# bench then exit:
+./main -m ./models/3B/open-llama-3b-q4-0.bin --tune --tune-file
+
+# load and run
+
+./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file
+```
+
+# Build the standalone `mulmat-tune`
+
+Makefile:
+```
+make clean && LLAMA_MULMAT_TUNE=1 make
+```
+
+CMake (with BLAS):
+```
+cmake --build . --target clean
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake --build . --config Release
+```
+
+Run examples:
+
+```
+./mulmat-tune -h
+
+# run with default params (7B, Q4_0, ...)
+./mulmat-tune
+
+# set model
+./mulmat-tune --model 13B
+
+# set ggml ftype, 2 for Q4_0, 3 for Q4_1, run `mulmat-tune -h` for help.
+./mulmat-tune --ftype 3
+
+# customized m_num
+./mulmat-tune --m_num 8
+
+# customized n_pass: run 1 pass only instead of the default 3.
+./mulmat-tune --n_pass 1
+
+# customized n_threads instead of the default 1.
+./mulmat-tune --n_threads 4
+
+# save to file
+./mulmat-tune --file
+
+# save to file, always override if exists (CAUTION!)
+./mulmat-tune --file -y
+
+```
+
+# End to End Test
+
+## Compare With Master
+
+You may want to run the following commands. Make sure the tune result file is
+set up properly.
+
+General steps:
+
+1. 
run `./mulmat-tune -h` to see how to build for misc vendors. + you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g: + ``` + make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make + ``` + On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along + with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU` + or `CPU_BLAS` is more faster (as of the estimation from mulmat tune). +2. create a small prompt file: + ``` + head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw + ``` +3. run any of the following example commands. + ``` + ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32 + ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64 + ``` + * `--mlock` is recommended for `macOS`, you may not want to use it. + * don't change `-c 128`: too large `context size` causes 0 perplexity trunk. + * `-t` is the number of threads, recommend `1`, `2`, `4` or `6`. + * you can change the batch size (`-b`) between `1` and `128`. + * you may want to add other cli options. + +The following results are generated with Accelerate compiled. + +### 1 thread + +**Master (2d43387d)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 43.53 | 339.95 | +| 16 | 44.31 | 346.12 | +| 24 | 43.14 | 336.90 | +| 32 | 33.59 | 262.25 | +| 40 | 27.64 | 215.77 | +| 48 | 24.52 | 191.42 | +``` + +**This branch (tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 43.78 | 341.96 | +| 16 | 42.88 | 334.93 | +| 24 | 42.06 | 328.42 | +| 32 | 33.07 | 258.25 | +| 40 | 28.69 | 223.98 | +| 48 | 25.65 | 200.19 | +``` + +### 4 threads + +**Master (2d43387d)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.43 | 96.99 | +| 16 | 12.10 | 94.44 | +| 24 | 12.81 | 99.95 | +| 32 | 31.64 | 247.04 | +| 48 | 24.55 | 191.63 | +| 64 | 17.56 | 137.09 | +| 96 | 17.59 | 137.25 | +| 128 | 10.73 | 83.74 | +``` + +**This branch (no tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.31 | 96.07 | +| 16 | 12.00 | 93.63 | +| 24 | 12.07 | 94.15 | +| 32 | 20.34 | 158.76 | +| 48 | 15.86 | 123.73 | +| 64 | 10.98 | 85.69 | +| 96 | 11.24 | 87.66 | +| 128 | 7.53 | 58.77 | +``` + +**This branch (tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.48 | 97.37 | +| 16 | 12.26 | 95.70 | +| 24 | 12.25 | 95.53 | +| 32 | 11.98 | 93.58 | +| 48 | 12.57 | 98.12 | +| 64 | 11.28 | 88.05 | +| 96 | 9.55 | 74.53 | +| 128 | 7.51 | 58.61 | +``` + +# Bench Data Format + +**Example** + +``` +5 3B 2 6 1 + +3200 3200 2 0 3 10 +16 0 0 0 16 1 0 1 0 0 0 0 +16 1 0 2 17 0 1 0 0 0 0 0 + 0 0 0 0 34 0 1 0 0 0 0 0 + 1 1 793 0 9103 2102 0 0 6014 0 + 2 2 1591 0 8034 2305 0 0 30982 0 + 4 4 2236 0 6476 2484 0 0 31388 0 + 8 7 4161 0 6623 2389 0 0 29204 0 + 16 15 8339 0 6434 2752 0 0 34303 0 + 32 32 16919 0 6915 3651 0 0 42511 0 + 64 200 34270 0 6574 4528 0 0 68212 0 + 128 188 69400 0 6325 6839 0 0 74437 0 + 256 303 134597 0 6168 11544 0 0 110180 0 + 512 687 279685 0 6337 29712 0 0 159728 0 + +3200 8640 2 0 2 10 + + ... 
+ + ``` + +**Informal Explanation** + +``` +head +groups+ + +head := version model ggml_ftype n_shapes n_threads +shape+ + +# head +version: 1 +model: "3B" | "7B" | "13B" | "30B" | "65B" +ggml_ftype: 0 - 4, 7 - 14 +n_shapes: number of shapes +n_threads: number of threads + +shape := N K m_num n_profiles +task_conf_profile+ +bench_item+ + +task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) +stage_conf: backend parallel wait +backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) +parallel: 0 (false) | 1 (true) +wait: 0 (false) | 1 (true) + +bench_item: M profile_time+ +profile_time := stage_time[3] +stage_time[3]: init_time, compute_time, finalize_time +``` + +A task stage is invalid if it's backend equals to `GGML_TASK_BACKEND_NONE`. +Time unit is `us`. A column is all zeros when that stage does not exist. + +# NOTE + +1. "3B" is [open-llama 3B](https://github.com/ggerganov/llama.cpp/pull/1588). +2. Model names are subject to change: we may support something like X-3B, Y-4B, ... +3. As of Jun 1, this tool is still in early stage, will be changed frequently in + recent couple of days (or weeks). diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp new file mode 100644 index 000000000..62f1da277 --- /dev/null +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -0,0 +1,277 @@ +#include +#include +#include +#include +#include + +#include "build-info.h" +#include "ggml-tune.h" +#include "ggml.h" +#include "llama.h" + +#define UNUSED(x) (void)(x) + +static void print_build_tips(void) { + const char *a = "LLAMA_NO_ACCELERATE"; + fprintf(stderr, "Tips on how to build with various backend vendors:\n\n"); + fprintf(stderr, "CUDA: make clean; LLAMA_CUBLAS=1 make\n"); + fprintf(stderr, "CL: make clean; LLAMA_CLBLAST=1 make\n"); + fprintf(stderr, "Accelerate: make clean; %s= make\n", a); + fprintf(stderr, "OpenBLAS: make clean; %s=1 LLAMA_OPENBLAS=1 make\n", a); + fprintf(stderr, "BLIS: make clean; %s=1 LLAMA_BLIS=1 make\n", a); + fprintf(stderr, "\n"); + fprintf(stderr, "NOTE: for CUDA/CL, use %s=1 to disable ACCELERATE\n", a); +} + +static bool prompt_yes_no(const char *prompt) { + char buf[2]; + while (true) { + fprintf(stderr, "%s (Y|n)\n", prompt); + buf[0] = 0; + buf[1] = 0; + int i = 0; + int c = 0; + + while (c != '\n') { + c = fgetc(stdin); + buf[i % 2] = c; + i++; + } + if (i == 1) { + if (buf[0] == '\n') { + return true; + } + } else if (i == 2) { + if (buf[0] == 'Y' || buf[0] == 'y') { + return true; + } + if (buf[0] == 'N' || buf[0] == 'n') { + return false; + } + } + } +} + +static void cmd_analyze(struct ggml_mulmat_tune *tune); + +static void usage(char *prog) { + const char *usage_lines[] = { + "usage: %s args", + "", + "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]", + "--model MODEL 3B | 7B | 13B | 30B | 65B", + " default 7B", + "--ftype FTYPE ggml ftype:", + " 0: all F32", + " 1: mostly F16", + " 2: mostly Q4_0", + " 3: mostly Q4_1", + " 4: mostly Q4_1, some F16", + " 7: mostly Q8_0", + " 8: mostly Q5_0", + " 9: mostly Q5_1", + " 10: mostly Q2_K", + " 11: mostly Q3_K", + " 12: mostly Q4_K", + " 13: mostly Q5_K", + " 14: mostly Q6_K", + " default 2 (mostly Q4_0)", + "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)", + " requires between [6, 12]", + " default 10", + "--n_pass PASS number of passes to run", + " default 1", + " requires: between [1, 3]", + "--n_threads NTH bench with this number of threads", + " requires: between [1, 16]", + " default 1", + "--file FILE data file to write", + " default 
stdout", + "-y always answer \"yes\" to all prompts", + }; + + int len = (int)(sizeof(usage_lines) / sizeof(char *)); + for (int i = 0; i < len; i++) { + const char *line = usage_lines[i]; + if (i == 0) { + fprintf(stderr, line, prog); + } else { + fprintf(stderr, "%s\n", line); + } + } + + printf("\n"); + print_build_tips(); + printf("\n"); +} + +int main(int argc, char **argv) { + if (argc == 2) { + if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { + usage(argv[0]); + return 0; + } + } + + int arg_start = 1; + + const char *arg_model = NULL; + const char *arg_ftype = NULL; + const char *arg_m_num = NULL; + const char *arg_n_threads = NULL; + const char *arg_n_pass = NULL; + const char *arg_file = NULL; + bool always_yes = false; + + for (int i = arg_start; i < argc; i++) { + if (strcmp(argv[i], "--model") == 0) { + if (i + 1 < argc) { + arg_model = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--ftype") == 0) { + if (i + 1 < argc) { + arg_ftype = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--m_num") == 0) { + if (i + 1 < argc) { + arg_m_num = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--n_pass") == 0) { + if (i + 1 < argc) { + arg_n_pass = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--n_threads") == 0) { + if (i + 1 < argc) { + arg_n_threads = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--file") == 0) { + if (i + 1 < argc) { + arg_file = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "-y") == 0) { + always_yes = true; + } else { + fprintf(stderr, "invalid arg: %s\n", argv[i]); + usage(argv[0]); + return 1; + } + } + + enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0; + { + if (arg_ftype != NULL) { + int v = atoi(arg_ftype); + ftype = (enum ggml_ftype)v; + } + + if (ftype > GGML_FTYPE_MOSTLY_Q5_1) { + fprintf(stderr, "k_quants type %d is not implemented\n", ftype); + return 1; + } + } + + if (arg_file != NULL && !always_yes) { + struct stat st; + int rc = stat(arg_file, &st); + UNUSED(st); + if (rc == 0) { // prompt + size_t len = strlen(arg_file) + 50; + char *prompt = (char *)malloc(len); + GGML_ASSERT(prompt); + snprintf(prompt, len, "data file '%s' exists, override?", arg_file); + + if (!prompt_yes_no(prompt)) { + printf("Aborted.\n"); + return 1; + } + free(prompt); + } + } + + int m_num = 10; + { + if (arg_m_num != NULL) { + int v = atoi(arg_m_num); + m_num = v; + } + + if (m_num < 6 || m_num > 12) { + fprintf(stderr, "invalid m_num: %d, expect between [6, 12]\n", + m_num); + usage(argv[0]); + return 1; + } + } + + int n_pass = 1; + { + if (arg_n_pass != NULL) { + int v = atoi(arg_n_pass); + n_pass = v; + } + if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) { + fprintf(stderr, "invalid n_pass: %d, expect between [1, %d]\n", + n_pass, GGML_MULMAT_MAX_PASS); + usage(argv[0]); + return 1; + } + } + + int n_threads = 1; + { + if (arg_n_threads != NULL) { + int v = atoi(arg_n_threads); + n_threads = v; + if (n_threads < 1 || n_threads > 16) { + fprintf(stderr, + "invalid n_threads: %d, expect between [1, 16]\n", + n_threads); + usage(argv[0]); + return 1; + } + } + } + + const char *model_name = "7B"; + { + if (arg_model != NULL) { + model_name = arg_model; + } + } + + // Let init message print earlier. 
+ { + struct ggml_init_params init_params = { + /*.mem_size =*/1, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/0, + }; + struct ggml_context *ctx = ggml_init(init_params); + GGML_ASSERT(ctx); + ggml_free(ctx); + } + + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params)); + + ggml_mulmat_init_task_profiles(); + + ggml_mulmat_tune_model_init(¶ms.model, model_name, ftype); + params.m_num = m_num; + params.n_pass = n_pass; + params.n_threads = n_threads; + params.progress = true; + params.output_console = true; + params.fname = arg_file; + + bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); + return ok ? 0 : 1; +} diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index ae8cfe0af..1f14c18de 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -158,6 +158,16 @@ int main(int argc, char ** argv) { return 1; } +#ifdef GGML_USE_MULMAT_TUNE + if (params.tune || !params.tune_file.empty()){ + bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); + if (!ok || (params.tune && !params.tune_file.empty())) { + llama_free(ctx); + return ok? 0: 1; + } + } +#endif + // print system information { fprintf(stderr, "\n"); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 16488b9f9..cf52109bc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2571,7 +2571,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { + if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { return false; } func = ggml_cuda_mul_mat; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 95f4cec6d..b2300a104 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_ } void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); if (src0->type == GGML_TYPE_F32) { ggml_cl_mul_mat_f32(src0, src1, dst); diff --git a/ggml-threading.c b/ggml-threading.c new file mode 100644 index 000000000..cf17793f6 --- /dev/null +++ b/ggml-threading.c @@ -0,0 +1,620 @@ + +#include +#include +#include + +#include "ggml-threading.h" +#include "ggml.h" + +#define UNUSED(x) (void)(x) + +// see https://github.com/ggerganov/llama.cpp/pull/1314 +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#include +static inline void ggml_spin_pause(void) { _mm_pause(); } +#else +static inline void ggml_spin_pause(void) {} +#endif + +#if defined(_WIN32) + +#include + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +typedef LONG atomic_flag; + +typedef CRITICAL_SECTION pthread_mutex_t; +typedef CONDITION_VARIABLE pthread_cond_t; +typedef void pthread_mutexattr_t; +typedef void pthread_condattr_t; + +typedef HANDLE pthread_t; + +static void atomic_store(atomic_int *ptr, LONG val) { + InterlockedExchange(ptr, val); +} + +static LONG atomic_load(atomic_int *ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} + +static LONG atomic_fetch_add(atomic_int *ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} + +static LONG atomic_fetch_sub(atomic_int *ptr, LONG dec) { + return 
atomic_fetch_add(ptr, -(dec)); +} + +static inline LONG atomic_flag_test_and_set(volatile atomic_flag *ptr) { + return InterlockedCompareExchange(ptr, 1, 0); +} +static inline LONG atomic_flag_clear(volatile atomic_flag *ptr) { + return InterlockedExchange(ptr, 0); +} +static int pthread_create(pthread_t *out, void *unused, + ggml_thread_ret_t (*func)(void *), void *arg) { + (void)unused; + HANDLE handle = + CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL); + if (handle == NULL) { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void *unused) { + (void)unused; + return (int)WaitForSingleObject(thread, INFINITE); +} + +static int pthread_mutex_init(pthread_mutex_t *mutex, + pthread_mutexattr_t *attr) { + (void)attr; + InitializeCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_destroy(pthread_mutex_t *mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_lock(pthread_mutex_t *mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_unlock(pthread_mutex_t *mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr) { + (void)attr; + InitializeConditionVariable(cond); + return 0; +} + +static int pthread_cond_destroy(pthread_cond_t *cond) { + (void)cond; + return 0; +} + +static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { + SleepConditionVariableCS(cond, mutex, INFINITE); + return 0; +} + +static int pthread_cond_signal(pthread_cond_t *cond) { + WakeConditionVariable(cond); + return 0; +} + +static int pthread_cond_broadcast(pthread_cond_t *cond) { + WakeAllConditionVariable(cond); + return 0; +} + +static int sched_yield(void) { + // https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-yieldprocessor + YieldProcessor(); + return 0; +} + +#else // ! _WIN32 + +#include +#include +#include + +#endif + +// #define GGML_THREADING_DEBUG 1 + +#ifdef GGML_THREADING_DEBUG +#define PRINT_DEBUG(...) fprintf(stdout, __VA_ARGS__) +#else +#define PRINT_DEBUG(...) +#endif + +struct ggml_perf_stats { + int runs; + + // total cycles + atomic_int cycles; + + // total time in us. + atomic_int time_us; +}; + +struct ggml_compute_state_shared { + atomic_flag spin; + pthread_mutex_t mutex; + pthread_cond_t cond; + + // number of threads that has entered thread runner. + atomic_int n_ready; + + // number of assigned but unfinished tasks, workers decrease it. + atomic_int n_tasks; + + // number of waiting workers, workers increase it. + atomic_int n_waiting; + + // commands. + atomic_bool wait_now; + atomic_bool wait_on_done; + atomic_bool stop; + + ggml_threading_task_runner *task_runner; + + struct ggml_threading_context *ctx; +}; +struct ggml_compute_state { + pthread_t thrd; + + atomic_bool has_work; + struct ggml_compute_params params; + struct ggml_tensor *node; + + struct ggml_compute_state_shared *shared; +}; +struct ggml_threading_context { + int n_threads; + struct ggml_compute_state_shared shared; + struct ggml_compute_state *workers; + + enum ggml_threading_features features; + + struct ggml_perf_stats wait_perf; + struct ggml_perf_stats wakeup_perf; + + int64_t *stages_time; +}; + +// NOTE: ggml_spin_lock and ggml_spin_unlock may can be noop if +// feature wait_on_done is off. 
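+// Descriptive note: the spin flag guards the shared scheduling state below
+// (n_tasks, n_waiting, wait_now, wait_on_done and each worker's has_work);
+// both the main thread and the workers take it around every state transition.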
+static inline void ggml_spin_lock(volatile atomic_flag *obj) { + while (atomic_flag_test_and_set(obj)) { + ggml_spin_pause(); + } +} + +static inline void ggml_spin_unlock(volatile atomic_flag *obj) { + atomic_flag_clear(obj); +} + +static inline void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0, + int64_t t0) { + st->runs++; + st->cycles += (ggml_cycles() - c0); + st->time_us += (ggml_time_us() - t0); +} + +// A worker thread goes cond waiting. +// NOTE: must be protected by shared->spin +static void ggml_threading_cond_wait(struct ggml_compute_state *state) { + struct ggml_compute_state_shared *shared = state->shared; + + int64_t perf_cycles_0 = 0; + int64_t perf_time_0 = 0; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + perf_cycles_0 = ggml_cycles(); + perf_time_0 = ggml_time_us(); + } + + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + + if (!shared->wait_now) { + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ggml_spin_unlock(&shared->spin); + return; + } + + shared->n_waiting++; + ggml_spin_unlock(&shared->spin); + + GGML_ASSERT(pthread_cond_wait(&shared->cond, &shared->mutex) == 0); + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + + ggml_spin_lock(&shared->spin); + + shared->n_waiting--; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + ggml_perf_collect(&shared->ctx->wait_perf, perf_cycles_0, perf_time_0); + } +} + +// Wakeup all workers. +// +// Workers takes some time to wakeup, and has to lock spin after wakeup. Yield +// is used to avoid signal frequently. Current implementation is highly +// experimental. See tests/test-ggml-threading.c for details. +// +// NOTE: must be protected by shared->spin +static void +ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { + int64_t perf_cycles_0 = 0; + int64_t perf_time_0 = 0; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + perf_cycles_0 = ggml_cycles(); + perf_time_0 = ggml_time_us(); + } + + shared->wait_now = false; + + int loop_counter = 0; + int notify_counter = 0; + int64_t last_signal_time = 0; + + while (shared->n_waiting != 0) { + ggml_spin_unlock(&shared->spin); + + if (loop_counter > 0) { + ggml_spin_pause(); + if (loop_counter > 3) { + sched_yield(); + } + } + ++loop_counter; + + // TODO: should bench actual average wait/wakeup time. + if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) { + continue; + } + + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ++notify_counter; + last_signal_time = ggml_time_us(); + + ggml_spin_lock(&shared->spin); + } + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, + perf_time_0); + } + + // if (notify_counter > 1) { + // printf("%s: loop counter: %d, notify counter: %d\n", __func__, + // loop_counter, notify_counter); + // } + UNUSED(notify_counter); +} + +// Setup workers for a task stage. 
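+// For a parallel stage this wakes any sleeping workers and, when the
+// WAIT_ON_DONE feature is enabled, arms wait_on_done if a later non-parallel
+// stage asks workers to wait. For a non-parallel stage with `wait` set, it
+// raises wait_now and blocks until every worker has parked on the condition
+// variable.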
+// NOTE: must be protected by shared->spin +static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, + struct ggml_task_profile *profile, + enum ggml_task_type type) { + PRINT_DEBUG("[main] setup workers for task ...\n"); + +#ifdef GGML_THREADING_DEBUG + int64_t t0 = ggml_time_us(); +#endif + + const int n_worker_threads = ctx->n_threads - 1; + struct ggml_task_stage *current = &profile->stages[type]; + struct ggml_compute_state_shared *shared = &ctx->shared; + + if (current->parallel) { + if (shared->n_waiting > 0) { + ggml_threading_wakeup_workers(shared); + } + + if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) { + // Optimize energy: wait_on_done. We MAY also check following nodes, + // but that's a bit complicated. + shared->wait_on_done = false; + for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) { + struct ggml_task_stage *next = &profile->stages[i]; + if (next->parallel) { + break; + } + if (next->wait) { + shared->wait_on_done = true; + PRINT_DEBUG("[main] wait_on_done is enabled for " + "current task stage\n"); + break; + } + } + } + } else if (current->wait) { + if (shared->n_waiting < n_worker_threads) { + shared->wait_now = true; + PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n", + n_worker_threads); + ggml_spin_unlock(&shared->spin); + + while (shared->n_waiting != n_worker_threads) { + ggml_spin_pause(); + } + + ggml_spin_lock(&shared->spin); + PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + } + } + + PRINT_DEBUG("[main] setup workers for task took %d us\n", + (int)(ggml_time_us() - t0)); +} + +ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { + GGML_ASSERT(data); + struct ggml_compute_state *state = (struct ggml_compute_state *)data; + GGML_ASSERT(state); + + struct ggml_compute_state_shared *shared = state->shared; + GGML_ASSERT(shared); + GGML_ASSERT(shared->task_runner); + + shared->n_ready++; + + PRINT_DEBUG("[%d-th] running\n", state->params.ith); + + while (!shared->stop) { + if (shared->wait_now) { + ggml_spin_lock(&shared->spin); + if (!state->has_work) { + ggml_threading_cond_wait(state); + } + ggml_spin_unlock(&shared->spin); + } + + if (shared->n_tasks > 0 && state->has_work) { + enum ggml_compute_error err = + shared->task_runner(&state->params, state->node); + + GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); + + ggml_spin_lock(&shared->spin); + + state->has_work = false; + shared->n_tasks--; + + bool wait = shared->wait_on_done && !state->has_work; + if (wait) { + ggml_threading_cond_wait(state); + } + + ggml_spin_unlock(&shared->spin); + + // no need to pause. + if (wait) { + continue; + } + } + + ggml_spin_pause(); + } + + PRINT_DEBUG("[%d-th] exited\n", state->params.ith); + return 0; +} + +enum ggml_compute_error +ggml_threading_compute_tensor(struct ggml_threading_context *ctx, + struct ggml_tensor *node, void *wdata, + size_t wsize) { + GGML_ASSERT(ctx); + GGML_ASSERT(node); + + GGML_ASSERT(ctx->shared.task_runner); + struct ggml_compute_state_shared *state_shared = &ctx->shared; + + // This is the params for main thread. 
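+    // Dispatch protocol for each enabled stage: worker j is given ith = j + 1
+    // and has_work = true, shared->n_tasks is set to n_tasks - 1, the main
+    // thread runs the ith == 0 slice itself, then spins until the workers have
+    // decremented n_tasks back to zero.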
+ struct ggml_compute_params params; + enum ggml_compute_error err; + + for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { + if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { + continue; + } + + PRINT_DEBUG("[main] stage: %d\n", type); + + int64_t t_stage = 0; + if (ctx->stages_time) { + t_stage = ggml_time_us(); + } + + // n_tasks is the total number of parallel computing tasks + // (including main thread). + int n_tasks = + node->task_profile.stages[type].parallel ? ctx->n_threads : 1; + + ggml_spin_lock(&state_shared->spin); + + if (ctx->n_threads > 1) { + ggml_threading_setup_workers(ctx, &node->task_profile, type); + } + + if (n_tasks > 1) { + // setup compute task parameters. + for (int j = 0; j < n_tasks - 1; j++) { + ctx->workers[j].params = (struct ggml_compute_params){ + .type = type, + .ith = j + 1, + .nth = n_tasks, + .wsize = wsize, + .wdata = wdata, + }; + ctx->workers[j].node = node; + ctx->workers[j].has_work = true; + } + state_shared->n_tasks = n_tasks - 1; + PRINT_DEBUG("[main] assigned %d tasks\n", state_shared->n_tasks); + } + + ggml_spin_unlock(&state_shared->spin); + + // main thread always run the 0-th task. + // TODO: assert(params->nth == 1) instead of + // assert(params->ith == 0) + { + params.type = type; + params.ith = 0; + params.nth = n_tasks; + params.wsize = wsize; + params.wdata = wdata; + + err = state_shared->task_runner(¶ms, node); + } + + // wait for tasks done. + if (n_tasks > 1) { + while (state_shared->n_tasks != 0) { + ggml_spin_pause(); + } + } + + PRINT_DEBUG("[main] all tasks finished\n\n"); + + if (ctx->stages_time) { + ctx->stages_time[type] = ggml_time_us() - t_stage; + } + + if (err != GGML_COMPUTE_OK) { + return err; + } + } + + return GGML_COMPUTE_OK; +} + +struct ggml_threading_context * +ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, + ggml_threading_task_runner *task_stage_runner, + enum ggml_threading_features features, + int64_t stages_time[3]) { + GGML_ASSERT(n_threads > 0); + GGML_ASSERT(thread_runner); + GGML_ASSERT(task_stage_runner); + + size_t ctx_sz = sizeof(struct ggml_threading_context); + struct ggml_threading_context *ctx = malloc(ctx_sz); + GGML_ASSERT(ctx); + memset(ctx, 0, ctx_sz); + + ctx->shared = (struct ggml_compute_state_shared){ + .spin = {0}, + .n_ready = 0, + .n_tasks = 0, + .n_waiting = 0, + .wait_now = false, + .wait_on_done = false, + .stop = false, + .task_runner = task_stage_runner, + .ctx = ctx, + }; + + PRINT_DEBUG("[main] thread start, features: %d\n", features); + + ctx->n_threads = n_threads; + ctx->features = features; + ctx->stages_time = stages_time; + + int n_workers = n_threads - 1; + if (n_workers > 0) { + GGML_ASSERT(pthread_mutex_init(&ctx->shared.mutex, NULL) == 0); + GGML_ASSERT(pthread_cond_init(&ctx->shared.cond, NULL) == 0); + + size_t workers_sz = sizeof(struct ggml_compute_state) * n_workers; + struct ggml_compute_state *workers = malloc(workers_sz); + GGML_ASSERT(workers); + memset(workers, 0, workers_sz); + + for (int j = 0; j < n_workers; j++) { + workers[j].shared = &ctx->shared; + GGML_ASSERT(pthread_create(&workers[j].thrd, NULL, thread_runner, + &workers[j]) == 0); + } + + ctx->workers = workers; + + while (ctx->shared.n_ready != n_workers) { + ggml_spin_pause(); + } + } + + return ctx; +} + +static void +ggml_threading_print_perf_stats(struct ggml_threading_context *ctx) { + bool print_stats = ctx->features & GGML_THREADING_FEATURE_PERF; +#ifdef GGML_THREADING_DEBUG + print_stats = true; +#endif + + if 
(!print_stats) { + return; + } + + const char *prefix_arr[2] = {"[threading wait ]", "[threading wakeup]"}; + struct ggml_perf_stats *st_arr[2] = {&ctx->wait_perf, &ctx->wakeup_perf}; + for (int i = 0; i < 2; i++) { + struct ggml_perf_stats *st = st_arr[i]; + if (st->runs == 0) { + continue; + } + fprintf(stdout, + "%s runs: %4d, avg cycles: %8.3f ms, avg time: " + "%8.3f ms\n", + prefix_arr[i], st->runs, + 1.0 * st->cycles / (st->runs * ggml_cycles_per_ms()), + 1.0 * st->time_us / (st->runs * 1000)); + } +} + +void ggml_threading_stop(struct ggml_threading_context *ctx) { + GGML_ASSERT(ctx); + + if (ctx->workers) { + PRINT_DEBUG("[main] stopping thread pool ...\n"); + ctx->shared.stop = true; + + ggml_spin_lock(&ctx->shared.spin); + ggml_threading_wakeup_workers(&ctx->shared); + ggml_spin_unlock(&ctx->shared.spin); + + for (int j = 0; j < ctx->n_threads - 1; j++) { + GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0); + } + free(ctx->workers); + PRINT_DEBUG("[main] thread pool stopped\n"); + } + + ggml_threading_print_perf_stats(ctx); + + free(ctx); +} diff --git a/ggml-threading.h b/ggml-threading.h new file mode 100644 index 000000000..f3214efc7 --- /dev/null +++ b/ggml-threading.h @@ -0,0 +1,68 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) +typedef int ggml_thread_ret_t; +#else +typedef void *ggml_thread_ret_t; +#endif + +struct ggml_threading_context; + +// Optional (experimental) features. +enum ggml_threading_features { + GGML_THREADING_FEATURE_NONE = 0, + GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0, + GGML_THREADING_FEATURE_PERF = 1 << 1, +}; + +// Compute errors. +enum ggml_compute_error { + GGML_COMPUTE_OK = 0, + GGML_COMPUTE_FALLBACK = 1, +}; + +// The task runner to be called by main thread and workers. +typedef enum ggml_compute_error(ggml_threading_task_runner)( + struct ggml_compute_params *params, struct ggml_tensor *node); + +// The thread runner to feed into OS threads. +typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); + +// Init and start underlying workers if n_threads > 1. +// +// features: optional for configure threading additional features. +// see `ggml_threading_feature`, default 0. +// stages_time: optional for collecting per-stage wall clock time. +struct ggml_threading_context * +ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, + ggml_threading_task_runner *task_stage_runner, + enum ggml_threading_features features, + int64_t stages_time[3]); + +// Stop workers (if exist), free memories (including the ctx). +void ggml_threading_stop(struct ggml_threading_context *ctx); + +// The default implementation of `ggml_threading_thread_runner` +ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data); + +// Compute a tensor. It computes the enabled task stages one by one. +// Caller should take care of the return error: retry for fallback error. +enum ggml_compute_error +ggml_threading_compute_tensor(struct ggml_threading_context *ctx, + struct ggml_tensor *node, void *wdata, + size_t wsize); + +// This is an experimental functionality for mulmat tune, as a thin wrapper. 
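+// It is the task runner that ggml-tune.c hands to ggml_threading_start();
+// presumably it simply forwards to ggml's internal compute-forward logic and
+// reports the result through enum ggml_compute_error.
+//
+// A minimal usage sketch of this header (hypothetical caller; `node`, `wdata`
+// and `wsize` are assumed to be prepared elsewhere, e.g. as in ggml-tune.c):
+//
+//   int64_t stages_time[3];
+//   struct ggml_threading_context *tctx = ggml_threading_start(
+//       n_threads, ggml_threading_graph_compute_thread,
+//       ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_NONE, stages_time);
+//   enum ggml_compute_error err =
+//       ggml_threading_compute_tensor(tctx, node, wdata, wsize);
+//   // on GGML_COMPUTE_FALLBACK the caller may retry with another task profile
+//   ggml_threading_stop(tctx);
+//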
+enum ggml_compute_error +ggml_compute_forward_wrapper(struct ggml_compute_params *params, + struct ggml_tensor *tensor); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-tune.c b/ggml-tune.c new file mode 100644 index 000000000..fbca953ed --- /dev/null +++ b/ggml-tune.c @@ -0,0 +1,897 @@ +#include + +#include "ggml-threading.h" +#include "ggml-tune.h" +#include "ggml.h" + +// MUL_MAT fine tunning for non-GPU-offloading cases. + +#define GGML_MULMAT_CACHE_LEN 16 +static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0}; + +#define FNV_OFFSET 14695981039346656037UL +#define FNV_PRIME 1099511628211UL +static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) { + char buf[30]; + snprintf(buf, 30, "%d%d%d", M, N, K); + + uint64_t hash = FNV_OFFSET; + for (const char *p = buf; *p; p++) { + hash ^= (uint64_t)(unsigned char)(*p); + hash *= FNV_PRIME; + } + return hash; +} + +static const char * +ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { + switch (backend) { + case GGML_TASK_BACKEND_NONE: + return ""; + case GGML_TASK_BACKEND_CPU: + return "CPU"; + case GGML_TASK_BACKEND_CPU_BLAS: + return "BLAS"; + case GGML_TASK_BACKEND_GPU: + return "GPU"; + case GGML_TASK_BACKEND_GPU_CUDA: + return "CUDA"; + case GGML_TASK_BACKEND_GPU_CL: + return "CL"; + default: + GGML_ASSERT(false); + } +} + +const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( + struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, int stages_time[3]) { + GGML_ASSERT(tune); + + // TODO: default_mm_cache is thread-unsafe. + struct mm_cache_element *mm_cache = default_mm_cache; + int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN; + struct mm_cache_element *e = &mm_cache[slot]; + + struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; + + struct ggml_task_profile *prof = NULL; + + if (e->M == M && e->N == N && e->K == K) { + prof = e->profile; + if (stages_time) { + for (int i = 0; i < 3; i++) { + stages_time[i] = e->stages_time[i]; + } + } + } else { + const struct ggml_mulmat_tune_shape *shape = NULL; + shape = ggml_mulmat_tune_get_shape(tune, N, K, src0_t, src1_t); + if (shape) { + ggml_mulmat_tune_estimate_time(shape, M, profiles_time); + + int min = INT32_MAX; + int index = -1; + for (int i = 0; i < shape->n_profiles; i++) { + int total = profiles_time[i].total_time; + if (total < min) { + min = total; + index = i; + } + } + + if (index >= 0) { + prof = profiles_time[index].profile; + for (int i = 0; i < 3; i++) { + int t = profiles_time[index].stage_time[i]; + if (stages_time) { + stages_time[i] = t; + } + e->stages_time[i] = t; + } + + GGML_ASSERT(prof); + + e->profile = prof; + e->M = M; + e->N = N; + e->K = K; + + // to disable this, build with + // `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 + // make` +#if !defined(GGML_MULMAT_TUNE_NDEBUG) + const char *names[3]; + for (int i = 0; i < 3; i++) { + names[i] = ggml_mulmat_tune_task_backend_name( + prof->stages[i].backend); + } + printf( + "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the " + "fastest profile: %s %s %s\n", + M, N, K, names[0], names[1], names[2]); +#endif + } + } + } + + return prof; +} + +void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, + const char *name, enum ggml_ftype ftype) { + const int n_vocab = 32000; + int n_embd; + // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult + int n_ff; + // n_rot = n_embd/n_head; + int n_rot; + + if (strcmp(name, "3B") == 0) { 
+ // n_head=32, n_mult=216, n_layer=26 + // https://github.com/ggerganov/llama.cpp/pull/1588 + n_embd = 3200; + n_ff = 8640; + n_rot = 100; + } else if (strcmp(name, "7B") == 0) { + n_embd = 4096; + n_ff = 11008; + n_rot = 128; + } else if (strcmp(name, "13B") == 0) { + n_embd = 5120; + n_ff = 13824; + n_rot = 128; + } else if (strcmp(name, "30B") == 0) { + n_embd = 6656; + n_ff = 17920; + n_rot = 128; + } else if (strcmp(name, "65B") == 0) { + n_embd = 8192; + n_ff = 22016; + n_rot = 128; + } else { + GGML_ASSERT(false); + } + + model->name = name; + model->ftype = ftype; + model->n_vocab = n_vocab; + model->n_embd = n_embd; + model->n_ff = n_ff; + model->n_rot = n_rot; +} + +bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params, + struct ggml_task_profile_factory *pf) { + + struct ggml_mulmat_tune_model *model = ¶ms->model; + + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + + tune->version = GGML_MULMAT_TUNE_VERSION; + tune->n_threads = params->n_threads; + tune->ftype = model->ftype; + + size_t name_len = strlen(model->name); + GGML_ASSERT(name_len > 0); + strncpy(tune->model, model->name, sizeof(tune->model) - 1); + + const enum ggml_type rot_src0_type = GGML_TYPE_F16; + const enum ggml_type src1_type = GGML_TYPE_F32; + + int n_vocab = model->n_vocab; + int n_embd = model->n_embd; + int n_ff = model->n_ff; + int n_rot = model->n_rot; + + enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype); + + GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6); + tune->n_shapes = GGML_MULMAT_N_SHAPES; + + // Attention layers + tune->shapes[0] = (struct ggml_mulmat_tune_shape){ + .N = n_embd, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + // Feed forward layers + tune->shapes[1] = (struct ggml_mulmat_tune_shape){ + .N = n_embd, .K = n_ff, .src0_type = type, .src1_type = src1_type}; + tune->shapes[2] = (struct ggml_mulmat_tune_shape){ + .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + tune->shapes[3] = (struct ggml_mulmat_tune_shape){ + .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + // RoPE + tune->shapes[4] = (struct ggml_mulmat_tune_shape){ + .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type}; + tune->shapes[5] = (struct ggml_mulmat_tune_shape){ + .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type}; + + for (int i = 0; i < tune->n_shapes; i++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + shape->n_profiles = ggml_mulmat_get_task_profiles( + pf, shape->src0_type, shape->src1_type, &shape->profiles); + if (shape->n_profiles == 0) { + // allowed for testing. + continue; + } + + shape->m_num = params->m_num; + shape->arr_m = malloc(shape->m_num * sizeof(int)); + for (int j = 0; j < shape->m_num; j++) { + shape->arr_m[j] = 1 << j; + } + + size_t sz = sizeof(struct ggml_mulmat_tune_m) * + (shape->n_profiles * shape->m_num); + shape->items = malloc(sz); + GGML_ASSERT(shape->items); + memset(shape->items, 0, sz); + } + + return true; +} + +void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { + for (int i = 0; i < tune->n_shapes; i++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + GGML_ASSERT(shape); + + // arr_m and items can be NULL only when testing. 
+ if (shape->arr_m) { + free(shape->arr_m); + } + if (shape->items) { + free(shape->items); + } + } +} + +static bool ggml_mulmat_tune_write_profiles( + FILE *fp, const struct ggml_task_profile *profiles, int n_profiles) { + int rc; + for (int i = 0; i < n_profiles; i++) { + const struct ggml_task_profile *profile = &profiles[i]; + for (int j = 0; j < 3; j++) { + const struct ggml_task_stage *ts = &profile->stages[j]; + rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, + ts->wait ? 1 : 0); + if (rc <= 0) { + return false; + } + if (j < 2) { + rc = fprintf(fp, " "); + if (rc <= 0) { + return false; + } + } + } + rc = fprintf(fp, "\n"); + if (rc <= 0) { + return false; + } + } + + return true; +} + +static bool +ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, + const char *model, int ftype, int n_threads, + char *errbuf, int errbuf_len) { + + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + snprintf(errbuf, errbuf_len - 1, + "version mismatch, built-in: %d, " + "yours: %d", + GGML_MULMAT_TUNE_VERSION, tune->version); + return false; + } else if (strcmp(model, tune->model) != 0) { + snprintf(errbuf, errbuf_len - 1, + "model mismatch. built-in: %s, yours: %s", model, tune->model); + return false; + } else if (ftype != tune->ftype) { + snprintf(errbuf, errbuf_len - 1, + "ftype mismatch. built-in: %d, yours: %d\n", ftype, + tune->ftype); + return false; + } else if (n_threads != tune->n_threads) { + snprintf(errbuf, errbuf_len - 1, + "n_threads mismatch. run-time: %d, yours: %d\n", n_threads, + tune->n_threads); + return false; + } + + for (int i = 0; i < tune->n_shapes; i++) { + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + + struct ggml_task_profile *builtin_profiles = NULL; + int n_profiles = ggml_mulmat_get_task_profiles( + NULL, shape->src0_type, shape->src1_type, &builtin_profiles); + + if (n_profiles != shape->n_profiles) { + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + return false; + } + + // TODO: profiles order is relevant, too strict. + size_t sz = sizeof(struct ggml_task_profile) * n_profiles; + if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + + printf("=== built-in profiles:\n"); + ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, + n_profiles); + + printf("=== incoming profiles:\n"); + ggml_mulmat_tune_write_profiles(stderr, shape->profiles, + shape->n_profiles); + return false; + } + } + + return true; +} + +bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, + const char *model, int ftype, int n_threads) { + char errbuf[128]; + bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads, + errbuf, sizeof(errbuf)); + if (!ok) { + fprintf(stderr, "[mulmat tune] error: %s. 
run bench again.\n", errbuf); + } + + return ok; +} + +bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { + int rc = fscanf(fp, "%d", &tune->version); + if (rc <= 0) { + return false; + } + + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n"); + return false; + } + + rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, + &tune->n_shapes, &tune->n_threads); + if (rc <= 0) { + return false; + } + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + + rc = fscanf(fp, "%d %d %d %d %d %d", &shape->N, &shape->K, + (int *)&shape->src0_type, (int *)&shape->src1_type, + &shape->n_profiles, &shape->m_num); + if (rc <= 0) { + return false; + } + + { + size_t item_size = sizeof(struct ggml_mulmat_tune_m) * + (shape->n_profiles * shape->m_num); + shape->items = malloc(item_size); + if (shape->items == NULL) { + fprintf(stderr, "[mulmat tune] failed to allocate memory\n"); + return false; + } + memset(shape->items, 0, item_size); + } + + { + size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles; + shape->profiles = malloc(sz); + GGML_ASSERT(shape->profiles); + memset(shape->profiles, 0, sz); + } + + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_task_profile *profile = &shape->profiles[ip]; + for (int j = 0; j < 3; j++) { + struct ggml_task_stage *ts = &profile->stages[j]; + int backend; + int parallel; + int wait; + rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait); + if (rc <= 0) { + return false; + } + ts->backend = (enum ggml_task_backend)backend; + ts->parallel = parallel ? true : false; + ts->wait = wait ? true : false; + } + } + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + int M; + for (int ip = 0; ip < shape->n_profiles; ip++) { + if (ip == 0) { + rc = fscanf(fp, "%d", &M); + if (rc <= 0) { + return false; + } + } + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + item->M = M; + rc = fscanf(fp, "%d %d %d", &item->stages_time[0], + &item->stages_time[1], &item->stages_time[2]); + if (rc <= 0) { + return false; + } + } + } + } + + return true; +} + +bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, + FILE *fp) { + int rc; + rc = fprintf(fp, "%d %s %d %d %d\n\n", tune->version, tune->model, + tune->ftype, tune->n_shapes, tune->n_threads); + if (rc <= 0) { + return false; + } + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + if (i_shape > 0) { + fprintf(fp, "\n"); + } + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + rc = fprintf(fp, "%d %d %d %d %d %d\n", shape->N, shape->K, + shape->src0_type, shape->src1_type, shape->n_profiles, + shape->m_num); + if (rc <= 0) { + return false; + } + + rc = ggml_mulmat_tune_write_profiles(fp, shape->profiles, + shape->n_profiles); + if (rc <= 0) { + return false; + } + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + if (ip == 0) { + rc = fprintf(fp, "%4d", item->M); + if (rc <= 0) { + return false; + } + } + + struct ggml_task_profile *profile = &shape->profiles[ip]; + for (int k = 0; k < 3; k++) { + if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { + rc = fprintf(fp, "%9d", item->stages_time[k]); + if (rc <= 0) { + return false; + } + } else { + rc = fprintf(fp, " 0"); + if (rc <= 0) { + return false; + } + } + } + } + rc 
= fprintf(fp, "\n"); + if (rc <= 0) { + return false; + } + } + } + + return true; +} + +const struct ggml_mulmat_tune_shape * +ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, + const int K, enum ggml_type src0_type, + enum ggml_type src1_type) { + GGML_ASSERT(N > 0 && K > 0); + + for (int i = 0; i < tune->n_shapes; i++) { + const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; + if (s->src0_type != src0_type || s->src1_type != src1_type) { + continue; + } + + if (s->N > 0 && s->K > 0) { + if (s->N == N && s->K == K) { + return s; + } + } else if (s->N > 0 && s->K == 0) { + if (s->N == N) { + return s; + } + } else if (s->N == 0 && s->K > 0) { + if (s->K == K) { + return s; + } + } + } + + return NULL; +} + +// This is the experimental reference implementation. +// Requires both n_threads are same at bench time and runtime. +void ggml_mulmat_tune_estimate_time( + const struct ggml_mulmat_tune_shape *shape, int M, + struct ggml_mulmat_tune_time *profile_time) { + + GGML_ASSERT(shape); + GGML_ASSERT(profile_time); + + const int m_num = shape->m_num; + const int min_m = shape->items[0].M; + const int max_m = shape->items[m_num - 1].M; + + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_task_profile *profile = &shape->profiles[ip]; + profile_time[ip].total_time = 0; + profile_time[ip].profile = profile; + + const int items_offset = ip * m_num; + + struct ggml_mulmat_tune_m *p0 = NULL; + struct ggml_mulmat_tune_m *p1 = NULL; + if (M < min_m) { + // first two. + p0 = &shape->items[items_offset]; + p1 = &shape->items[items_offset + 1]; + } else if (M > max_m) { + // last two + p0 = &shape->items[items_offset + m_num - 2]; + p1 = &shape->items[items_offset + m_num - 1]; + } else { + for (int i = 0; i < m_num; i++) { + p1 = &shape->items[items_offset + i]; + if (p1->M == M) { + p0 = p1; + break; + } + + if (i > 0) { + p0 = (struct ggml_mulmat_tune_m *)(p1 - 1); + if (M > p0->M && M < p1->M) { + break; + } + } + } + } + + GGML_ASSERT(p0 && p1); + + for (int i_stage = 0; i_stage < 3; i_stage++) { + struct ggml_task_stage *stage = &profile->stages[i_stage]; + if (stage->backend == GGML_TASK_BACKEND_NONE) { + continue; + } + + int p0_v = p0->stages_time[i_stage]; + int p1_v = p1->stages_time[i_stage]; + + GGML_ASSERT(p0_v >= 0); + GGML_ASSERT(p1_v >= 0); + + // t = aM + b + double a; + double b; + + if (p0 == p1) { + a = 0.0; + b = p1_v; + } else { + a = 1.0 * (p1_v - p0_v) / (p1->M - p0->M); + b = p1_v - a * p1->M; + } + int t = (int)(a * M + b); + + profile_time[ip].stage_time[i_stage] = t; + profile_time[ip].total_time += t; + } + } +} + +// Experimental: create mul_mat tensor. +static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K, + enum ggml_type src0_type, + struct ggml_context **ctx) { + // At most 256, because in `ggml_quantize_qx_x`, the index type of hist is + // either int8_t or uint8_t. + // Use 1024 to avoid suddenly broken. 
+ int64_t hist[1024]; + + bool src0_is_quantized = ggml_is_quantized(src0_type); + + size_t ctx_size = 0; + ctx_size += (size_t)(M * N * ggml_type_sizef(GGML_TYPE_F32)); // src1 + ctx_size += (size_t)(N * K * ggml_type_sizef(src0_type)); // src0 + ctx_size += (size_t)(1024 * 1024 * 64); // experimental + + if (src0_is_quantized) { + // quantize F32 to Qx_x + ctx_size += (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)); + } + + struct ggml_init_params init_params = { + .mem_size = ctx_size, + .mem_buffer = NULL, + .no_alloc = 0, + }; + + *ctx = ggml_init(init_params); + GGML_ASSERT(*ctx); + + // src0: N x K + struct ggml_tensor *src0 = + ggml_new_tensor_2d(*ctx, src0_type, (int64_t)K, (int64_t)N); + + // src1: M x K + struct ggml_tensor *src1 = + ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)M); + ggml_set_f32(src1, 0.5f); + + if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) { + ggml_set_f32(src0, 0.1f); + } else if (src0_is_quantized) { + struct ggml_tensor *src0_f32 = + ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N); + ggml_set_f32(src0_f32, 0.1f); + + switch (src0_type) { + case GGML_TYPE_Q4_0: + ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q4_1: + ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q5_0: + ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q5_1: + ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q8_0: + ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + default: + GGML_ASSERT(false); + } + } else { + GGML_ASSERT(false); + } + + // node: M x N + // Will compute z = y * xT, z: node, y: src1, x: src0 + return ggml_mul_mat(*ctx, src0, src1); +} + +// Experimental: allocate memory for wdata with max possible size. +// This part of code is actually belongs to ggml compute graph. +static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { + // The size is actually determined by cgraph before computing. + // Apart from the src0_type, wsize is affected by backend, cache line size, + // n_threads etc. 
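+    // (Sizing note, inferred from the wsize logic in ggml.c: the dequantized
+    // src0 used by the CPU/BLAS paths needs up to N*K floats, which is what
+    // the first term below covers; `extra` is just slack.)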
+ + const size_t extra = 1024 * 1024; + size_t sz = (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)) + extra; + void *buf = malloc(sz); + + if (!buf) { + fprintf(stderr, + "[mulmat tune] error: failed to allocate %zu MiB memory", + sz / 1024 / 1024); + return 0; + } + + memset(buf, 0, sz); + *wdata = buf; + return sz; +} + +int ggml_mulmat_tune_get_builtin_task_backends( + enum ggml_task_backend *backends) { + int i = 0; + backends[i++] = GGML_TASK_BACKEND_CPU; + + if (ggml_cpu_has_cpublas()) { + backends[i++] = GGML_TASK_BACKEND_CPU_BLAS; + } + + if (ggml_cpu_has_cublas()) { + backends[i++] = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + backends[i++] = GGML_TASK_BACKEND_GPU_CL; + } + return i; +} + +bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params) { + GGML_ASSERT(tune); + GGML_ASSERT(params); + GGML_ASSERT(params->model.name); + + enum ggml_task_backend backends[16]; + int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); + if (n_backends < 2) { + fprintf(stderr, + "[mulmat tune] error: this program was not built with BLAS.\n"); + return false; + } + + bool ok = ggml_mulmat_tune_init(tune, params, NULL); + if (!ok) { + return false; + } + + { + char buf[128] = {0}; + int offset = 0; + + for (int i = 0; i < n_backends; i++) { + if (i > 0) { + buf[offset++] = ','; + buf[offset++] = ' '; + } + const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); + size_t len = strlen(name); + memcpy(&buf[offset], name, len); + offset += (int)len; + } + + fprintf(stdout, + "[mulmat tune] model: %s, ggml ftype: %d, " + "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", + params->model.name, params->model.ftype, params->n_pass, + params->n_threads, tune->n_shapes, buf); + } + + int64_t stages_time[3]; + int64_t t0 = ggml_time_ms(); + + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + tune->n_threads, ggml_threading_graph_compute_thread, + ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE, + stages_time); + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + int M; + int N = shape->N; + int K = shape->K; + + char buf[20] = {0}; + int buf_len = sizeof(buf) - 1; + int line_len = 0; + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + M = shape->arr_m[i_m]; + if (shape->N == 0) { + N = M; + } else if (shape->K == 0) { + K = M; + } + + if (params->progress) { + line_len = snprintf(buf, buf_len, "%d %d %d ", N, K, M); + fprintf(stdout, "%s", buf); + fflush(stdout); + } + + char *wdata = NULL; + size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata); + if (wsize == 0) { + return false; + } + + struct ggml_context *ctx = NULL; + struct ggml_tensor *node = + ggml_mulmat_new_tensor(M, N, K, shape->src0_type, &ctx); + + for (int ip = 0; ip < shape->n_profiles; ip++) { + const struct ggml_task_profile *profile = &shape->profiles[ip]; + + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + item->M = M; + + int min[3] = {INT32_MAX, INT32_MAX, INT32_MAX}; + + for (int k = 0; k < params->n_pass; k++) { + for (int j = 0; j < 3; j++) { + stages_time[j] = 0; + } + + /*enum ggml_compute_error err = */ + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + + for (int i = 0; i < 3; i++) { + int v = (int)stages_time[i]; + if (v < min[i]) { + min[i] = v; + } + } + + if (params->progress) { + 
fprintf(stdout, "."); + fflush(stdout); + line_len++; + } + } + for (int i = 0; i < 3; i++) { + item->stages_time[i] = min[i]; + } + } + + ggml_free(ctx); + free(wdata); + + if (params->progress) { + line_len += 10; + for (int j = 0; j < line_len; j++) { + fprintf(stdout, "\b \b"); + } + fflush(stdout); + } + } + } + + ggml_threading_stop(thrd_ctx); + + fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n", + (int)(ggml_time_ms() - t0) / 1000); + + // output + + if (params->fname && strcmp(params->fname, "") != 0) { + FILE *fp = fopen(params->fname, "w"); + if (!fp) { + fprintf(stderr, + "[mulmat tune] warn: failed to open file `%s`, print to " + "console instead\n\n", + params->fname); + params->output_console = 1; + } else { + ok = ggml_mulmat_tune_write_data(tune, fp); + fclose(fp); + + if (ok) { + fprintf(stdout, "[mulmat tune] data was written to `%s`\n", + params->fname); + } else { + fprintf( + stderr, + "[mulmat tune] warn: failed to write file `%s`, print to " + "console instead\n\n", + params->fname); + params->output_console = 1; + } + } + } + + if (params->output_console) { + return ggml_mulmat_tune_write_data(tune, stdout); + } + + return true; +} diff --git a/ggml-tune.h b/ggml-tune.h new file mode 100644 index 000000000..404f1f1c4 --- /dev/null +++ b/ggml-tune.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_MULMAT_TUNE_VERSION 8 +#define GGML_MULMAT_N_SHAPES 6 + +#define GGML_MULMAT_MAX_PASS 3 + +struct ggml_mulmat_tune_m { + int M; + + int stages_time[3]; +}; + +struct ggml_mulmat_tune_model { + const char *name; + + enum ggml_ftype ftype; + + int n_vocab; + + int n_embd; + + // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult + int n_ff; + + // n_rot = n_embd/n_head; + int n_rot; +}; + +struct ggml_mulmat_tune_shape { + // For RoPE, one of N / K is 0. + int N; + int K; + + enum ggml_type src0_type; + enum ggml_type src1_type; + + int n_profiles; + struct ggml_task_profile *profiles; + + int m_num; + int *arr_m; + + struct ggml_mulmat_tune_m *items; +}; + +struct ggml_mulmat_tune { + int version; + + char model[16]; + + enum ggml_ftype ftype; + + int n_shapes; + // Given N/K, we bench for mul_mat [M,K] x [K,N]. + struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES]; + + int n_threads; +}; + +struct ggml_mulmat_tune_time { + struct ggml_task_profile *profile; + int stage_time[3]; + int total_time; +}; + +struct mm_cache_element { + int M; + int N; + int K; + struct ggml_task_profile *profile; + int stages_time[3]; +}; + +// params for tune/bench. +struct ggml_mulmat_tune_params { + struct ggml_mulmat_tune_model model; + int m_num; + int n_pass; + int n_threads; + bool progress; // print and clear '.' + bool output_console; // also print result to console + const char *fname; +}; + +// NOTE: stages_time is filled if not null. 
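+// (stages_time[i] is the estimated time of stage i in microseconds; pass NULL
+// when the estimates are not needed.)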
+const struct ggml_task_profile * +ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, int stages_time[3]); + +bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, + const char *model_name, int ftype, + int n_threads); + +void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, + const char *name, enum ggml_ftype ftype); + +bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params, + struct ggml_task_profile_factory *profile_factory); + +void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); + +bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); + +bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); + +const struct ggml_mulmat_tune_shape * +ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, + enum ggml_type src0_type, enum ggml_type src1_type); + +void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape, + int M, + struct ggml_mulmat_tune_time *profile_time); + +const char *ggml_task_backend_name(enum ggml_task_backend backend); + +int ggml_mulmat_tune_get_builtin_task_backends( + enum ggml_task_backend *backends); + +bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 78c365354..5d0b83b1d 100644 --- a/ggml.c +++ b/ggml.c @@ -61,26 +61,6 @@ static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void* unused) { - (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); -} - static int sched_yield (void) { Sleep (0); return 0; @@ -88,8 +68,6 @@ static int sched_yield (void) { #else #include #include - -typedef void* thread_ret_t; #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -166,6 +144,12 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif +#if defined(GGML_USE_MULMAT_TUNE) + #include "ggml-tune.h" +#endif + +#include "ggml-threading.h" + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -4059,6 +4043,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif + ggml_mulmat_init_task_profiles(); + is_first_call = false; } @@ -4302,7 +4288,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, + /*.task_profile =*/ { 0 }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -8516,14 +8502,19 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); + } + return; } - return; - } +#else + GGML_ASSERT(false); #endif + }; const int64_t nr = ggml_nrows(src0); @@ -9950,36 +9941,6 @@ static void ggml_compute_forward_rms_norm_back( } -// ggml_compute_forward_mul_mat - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10050,28 +10011,25 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -10089,16 +10047,13 @@ static void ggml_compute_forward_mul_mat_f32( //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; - } +#else + GGML_ASSERT(false); #endif - - if (params->type == GGML_TASK_INIT) { - return; } - if 
(params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); // parallelize by src0 rows using ggml_vec_dot_f32 @@ -10215,30 +10170,26 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); - - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -10271,8 +10222,14 @@ static void ggml_compute_forward_mul_mat_f16_f32( /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; - } +#else + GGML_ASSERT(false); #endif + } + + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); if (params->type == GGML_TASK_INIT) { ggml_fp16_t * const wdata = params->wdata; @@ -10293,9 +10250,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT (params->type == GGML_TASK_COMPUTE); // fp16 -> half the size, so divide by 2 // TODO: do not support transposed src1 @@ -10420,50 +10375,62 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT (init_backend == 
GGML_TASK_BACKEND_CPU); + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(src0->data); + GGML_ASSERT(params->wdata); float * const wdata = params->wdata; dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + if (params->type == GGML_TASK_INIT) { + // rows per thread + const int dr = (ne01 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + int ir1 = MIN(ir0 + dr, ne01); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; + float * wdata_offset = wdata + i03*ne03 + i02*ne02; + for (int64_t i = ir0; i < ir1; ++i) { + dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); + } + } + } + return; + } + + GGML_ASSERT(nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); - id += ne00; - } - - assert(id*sizeof(float) <= params->wsize); - } - + // zT = y * xT const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, @@ -10472,13 +10439,19 @@ static void ggml_compute_forward_mul_mat_q_f32( } } - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - return; - } +#else + GGML_ASSERT(false); #endif + } + + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); if (params->type == GGML_TASK_INIT) { + GGML_ASSERT(params->nth == 1); + char * wdata = params->wdata; const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; @@ -10490,13 +10463,10 @@ static void ggml_compute_forward_mul_mat_q_f32( } } } - return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); // parallelize by src0 rows using ggml_vec_dot_q @@ -14324,20 +14294,31 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); -#ifdef GGML_USE_CUBLAS - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return; + enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { +#if defined(GGML_USE_CUBLAS) + bool skip_cpu = ggml_cuda_compute_forward(params, tensor); + if (skip_cpu) { + return GGML_COMPUTE_OK; + } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; +#else + GGML_ASSERT(false); +#endif } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); -#endif // GGML_USE_CUBLAS + + // if 
(tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) { + // printf("mulmat: test fallback\n"); + // return GGML_COMPUTE_FALLBACK; + // } switch (tensor->op) { case GGML_OP_DUP: @@ -14585,6 +14566,15 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(false); } break; } + + return GGML_COMPUTE_OK; +} + +enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + // We call ggml_compute_forward because the CUDA mul_mat entry point + // was moved out of `ggml_compute_forward_mul_mat`. + return ggml_compute_forward(params, tensor); } //////////////////////////////////////////////////////////////////////////////// @@ -15480,6 +15470,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.tune =*/ NULL, /*.work_size =*/ 0, /*.work =*/ NULL, /*.nodes =*/ { NULL }, @@ -15533,175 +15524,288 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg return result; } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// +// ---- mulmat task profiles ---- -#ifdef __APPLE__ +static struct ggml_task_profile_factory default_task_profile_factory = {0}; -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT +// TODO: thread unsafe. Should be initialized once. 
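+// A possible fix (sketch only, not part of this change): make the init
+// idempotent with a C11 once-flag so concurrent first calls cannot race, e.g.
+//     static atomic_flag once = ATOMIC_FLAG_INIT;
+//     if (atomic_flag_test_and_set(&once)) return;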
+void ggml_mulmat_init_task_profiles(void) { + const size_t sz = sizeof(struct ggml_task_profile_factory); + memset(&default_task_profile_factory, 0, sz); -typedef int ggml_lock_t; + // f32 + { + struct ggml_task_profile *p = default_task_profile_factory.f32_f32; + int i = 0; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -typedef pthread_t ggml_thread_t; - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -typedef pthread_t ggml_thread_t; - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; #endif -struct ggml_compute_state_shared { - ggml_lock_t spin; +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_f32_f32 = i; + } - int n_threads; + // f16 + { + struct ggml_task_profile *p = default_task_profile_factory.f16_f32; + int i = 0; - // synchronization primitives - atomic_int n_ready; - atomic_bool has_work; - atomic_bool stop; // stop all threads -}; + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; -struct ggml_compute_state { - ggml_thread_t thrd; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif - struct ggml_compute_params params; - struct ggml_tensor * node; +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_f16_f32 = i; + } - struct ggml_compute_state_shared * shared; -}; + // qxx + { + struct ggml_task_profile *p = default_task_profile_factory.qxx_f32; + int i = 0; -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; - const int n_threads = state->shared->n_threads; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[0].parallel = true; + p[i].stages[1].backend = 
GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif - while (true) { - if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { - atomic_store(&state->shared->has_work, false); - } else { - while (atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_qxx_f32 = i; + } +} + +int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf, + enum ggml_type src0_t, enum ggml_type src1_t, + struct ggml_task_profile **profiles) { + GGML_ASSERT(profiles); + + if (pf == NULL) { + pf = &default_task_profile_factory; + } + + GGML_ASSERT(src1_t == GGML_TYPE_F32); + + if (src0_t == GGML_TYPE_F32) { + *profiles = pf->f32_f32; + return pf->n_f32_f32; + } + + if (src0_t == GGML_TYPE_F16) { + *profiles = pf->f16_f32; + return pf->n_f16_f32; + } + + if (ggml_is_quantized(src0_t)) { + *profiles = pf->qxx_f32; + return pf->n_qxx_f32; + } + + GGML_ASSERT(false); +} + +static const struct ggml_task_profile * +ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf, + enum ggml_type src0_type, + enum ggml_type src1_type) { + GGML_ASSERT(src1_type == GGML_TYPE_F32); + if (pf == NULL) { + pf = &default_task_profile_factory; + } + + struct ggml_task_profile *p = NULL; + + if (src0_type == GGML_TYPE_F32) { + p = &pf->f32_f32[0]; + } else if (src0_type == GGML_TYPE_F16) { + p = &pf->f16_f32[0]; + } else if (ggml_is_quantized(src0_type)) { + p = &pf->qxx_f32[0]; + } else { + GGML_ASSERT(false); + } + + for (int i = 0; i < 3; i++) { + GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU || + p->stages[i].backend == GGML_TASK_BACKEND_NONE); + } + + return p; +} + +// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. +static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, + struct ggml_mulmat_tune *tune) { + GGML_ASSERT(node); + GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); + + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + + // Type and memory layout requirements for computing mul_mat with BLAS. + bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); + + int M = (int)node->ne[1]; + int N = (int)node->ne[0]; + int K = (int)node->src1->ne[0]; + + struct ggml_task_profile *profiles = NULL; + int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles); + GGML_ASSERT(n_profiles >= 2); + GGML_ASSERT(profiles); + + const struct ggml_task_profile *prof = NULL; + + if (cond_match) { +#if defined(GGML_USE_MULMAT_TUNE) + if (tune != NULL) { + int stages_time_us[3]; + prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); + if (prof != NULL) { + GGML_ASSERT(prof); + memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); + // Do not wait if the estimated execution time is too small (e.g. 
less than 0.1 ms) + // TODO: need bench actual wait/notify time, see ggml-threading.c + for (int i = 0; i < 3; i++) { + if (node->task_profile.stages[i].wait) { + if (stages_time_us[i] < 100) { + node->task_profile.stages[i].wait = false; + } + } + } + return; + } + } +#else + UNUSED(tune); +#endif + + if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { + for (int j = 0; j < n_profiles; j++) { + enum ggml_task_backend comp_be = + profiles[j].stages[GGML_TASK_COMPUTE].backend; + + switch (comp_be) { + case GGML_TASK_BACKEND_GPU_CUDA: { + GGML_ASSERT(ggml_cpu_has_cublas()); + prof = &profiles[j]; + break; + } + case GGML_TASK_BACKEND_GPU_CL: { + GGML_ASSERT(ggml_cpu_has_clblast()); + prof = &profiles[j]; + break; + } + case GGML_TASK_BACKEND_CPU_BLAS: { + GGML_ASSERT(ggml_cpu_has_cpublas()); + prof = &profiles[j]; + break; + } + default: { + break; + } } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); } } - - atomic_fetch_sub(&state->shared->n_ready, 1); - - // wait for work - while (!atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); - } - - // check if we should stop - if (atomic_load(&state->shared->stop)) { - break; - } - - if (state->node) { - if (state->params.ith < state->params.nth) { - ggml_compute_forward(&state->params, state->node); - } - - state->node = NULL; - } else { - break; - } } - return 0; + if (prof == NULL) { + prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t); + } + + GGML_ASSERT(prof); + memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; + int n_threads = cgraph->n_threads; - struct ggml_compute_state_shared state_shared = { - /*.spin =*/ GGML_LOCK_INITIALIZER, - /*.n_threads =*/ n_threads, - /*.n_ready =*/ 0, - /*.has_work =*/ false, - /*.stop =*/ false, - }; - struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; + if (ggml_cpu_has_blas()) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor *node = cgraph->nodes[i]; - // create thread pool - if (n_threads > 1) { - ggml_lock_init(&state_shared.spin); + memset(&node->task_profile, 0, sizeof(struct ggml_task_profile)); + struct ggml_task_stage *stages = node->task_profile.stages; - atomic_store(&state_shared.has_work, true); + // Adapt node->backend: assume GPU at COMPUTE stage. + if (node->backend > GGML_BACKEND_CPU) { + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE; + stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE; - for (int j = 0; j < n_threads - 1; j++) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .params = { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = n_threads, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }, - .node = NULL, - .shared = &state_shared, - }; - - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); + stages[GGML_TASK_COMPUTE].parallel = false; + bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL); + stages[GGML_TASK_COMPUTE].wait = wait; + if (ggml_cpu_has_cublas()) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + } else if (node->op == GGML_OP_MUL_MAT) { + struct ggml_mulmat_tune * tune = NULL; +#if defined(GGML_USE_MULMAT_TUNE) + tune = cgraph->tune; +#endif + ggml_mulmat_set_tensor_task_profile(node, tune); + } else if (node->op == GGML_OP_OUT_PROD) { + ggml_mulmat_set_tensor_task_profile(node, NULL); + } } } + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + // initialize tasks + work buffer { size_t work_size = 0; @@ -15709,13 +15813,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + struct ggml_task_stage *stages = node->task_profile.stages; switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - node->n_tasks = n_threads; - + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; size_t cur = 0; if (ggml_is_quantized(node->type)) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; @@ -15726,7 +15830,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ADD: case GGML_OP_ADD1: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15738,7 +15843,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_ACC: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15764,9 +15871,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_STEP: case GGML_OP_RELU: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_MUL: + { + if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; + } + } break; case GGML_OP_GELU: case GGML_OP_SILU: case GGML_OP_SILU_BACK: @@ -15774,66 +15887,65 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, 
node->n_tasks); - size_t cur = 0; + enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; + GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); -#if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + // TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct. + if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); + GGML_ASSERT(cond_match); } - else -#elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { +#if defined(GGML_USE_CLBLAST) cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else +#else + GGML_ASSERT(false); #endif - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + GGML_ASSERT(ggml_cpu_has_cpublas()); + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { // here we need memory just for single 2D matrix from src0 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { - cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - } -#endif - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; + } else if (ggml_is_quantized(node->src0->type)) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else -#endif - { + } else { + GGML_ASSERT(false); + } + } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { + // We have to reseve buffer for CUDA because it may fallback to CPU. 
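+                        // (On GGML_COMPUTE_FALLBACK, ggml_graph_compute() retries the
+                        // node with the default CPU task profile, so the CPU-sized work
+                        // buffer must be reserved up front.)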
+ if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { + GGML_ASSERT(ggml_cpu_has_cublas()); + } + + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + } else if (ggml_is_quantized(node->src0->type)) { const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + } else { + GGML_ASSERT(false); } } else { GGML_ASSERT(false); @@ -15843,9 +15955,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_SET: + { + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -15856,7 +15973,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -15864,20 +15981,23 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_ALIBI: { - node->n_tasks = 1; //TODO + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CLAMP: { - node->n_tasks = 1; //TODO + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); @@ -15906,45 +16026,48 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_ATTN: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - node->n_tasks = n_threads; - + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = 
sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_ATTN_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15952,13 +16075,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -15966,32 +16089,38 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; + stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU; - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_COUNT: { GGML_ASSERT(false); } break; + default: + GGML_ASSERT(false); } } @@ -16023,126 +16152,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_node_start_cycles = ggml_perf_cycles(); const int64_t perf_node_start_time_us = 
ggml_perf_time_us(); - // INIT - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, - /*.ith =*/ 0, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, - }; - - ggml_compute_forward(¶ms, node); - - // COMPUTE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); + // TODO: can be moved out of loop? + void *wdata = NULL; + size_t wsize = 0; + if (cgraph->work) { + wdata = cgraph->work->data; + wsize = ggml_nbytes(cgraph->work); } - params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - } - - // FINALIZE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_FINALIZE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); - } - - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + enum ggml_compute_error err = + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + if (err == GGML_COMPUTE_FALLBACK) { + if (node->op == GGML_OP_MUL_MAT) { + const struct ggml_task_profile *p = + ggml_mulmat_get_default_task_profile( + NULL, node->src0->type, node->src1->type); + memcpy(&node->task_profile, p, + sizeof(struct ggml_task_profile)); } + err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); } + GGML_ASSERT(err == GGML_COMPUTE_OK); // performance stats (node) { @@ -16155,19 +16185,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } - // join thread pool - if (n_threads > 1) { - atomic_store(&state_shared.stop, true); - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - ggml_lock_destroy(&state_shared.spin); - } + ggml_threading_stop(thrd_ctx); // performance stats (graph) { @@ -16242,7 +16260,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, + tensor->task_profile.stages[0].parallel, // replaceed n_tasks. 
tensor->data, tensor->name); } @@ -18024,14 +18042,6 @@ int ggml_cpu_has_wasm_simd(void) { #endif } -int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - return 1; -#else - return 0; -#endif -} - int ggml_cpu_has_cublas(void) { #if defined(GGML_USE_CUBLAS) return 1; @@ -18048,10 +18058,22 @@ int ggml_cpu_has_clblast(void) { #endif } +int ggml_cpu_has_cpublas(void) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_gpublas(void) { return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); } +int ggml_cpu_has_blas(void) { + return ggml_cpu_has_cpublas() || ggml_cpu_has_gpublas(); +} + int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; diff --git a/ggml.h b/ggml.h index 1380c530f..f51b658fd 100644 --- a/ggml.h +++ b/ggml.h @@ -1,5 +1,7 @@ #pragma once + + // // GGML Tensor Library // @@ -200,6 +202,7 @@ #define GGML_MAX_OPT 4 #define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 +#define GGML_MAX_TASK_PROFILES 8 #define GGML_ASSERT(x) \ do { \ @@ -347,7 +350,6 @@ extern "C" { GGML_OP_COUNT, }; - // ggml object struct ggml_object { size_t offs; @@ -360,6 +362,54 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // As part of task config profile solution, `ggml_task_backend` defines + // backends for each task stage. Similar to `ggml_tensor.backend`, + // `ggml_tensor.task_profile` generalizes how to configure tensor computing + // at per task-stage level. + // + // The following enum values are designed as combination of hardware and + // optional software interface. + enum ggml_task_backend { + GGML_TASK_BACKEND_NONE = 0, + + // [0x10, 0x1F]: CPU + GGML_TASK_BACKEND_CPU = 0x10, + GGML_TASK_BACKEND_CPU_BLAS = 0x11, + + // [0x20 - 0x2F]: GPU + GGML_TASK_BACKEND_GPU = 0x20, + GGML_TASK_BACKEND_GPU_CUDA = 0x21, + GGML_TASK_BACKEND_GPU_CL = 0x22, + }; + + // config for computing one of the 3 task stages of a tensor. + struct ggml_task_stage { + enum ggml_task_backend backend; + bool parallel; + // hint idle workers go waiting, valid only when parallel is false. + bool wait; + }; + + // config for computing a tensor. + struct ggml_task_profile { + // index 0: INIT, 1: COMPUTE, 2: FINALIZE + struct ggml_task_stage stages[3]; + + // MUST be used only in testing codes. + uint8_t dev_flags[4]; + }; + + struct ggml_task_profile_factory { + struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES]; + int n_f32_f32; + + struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES]; + int n_f16_f32; + + struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES]; + int n_qxx_f32; + }; + // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -383,7 +433,8 @@ extern "C" { struct ggml_tensor * opt[GGML_MAX_OPT]; // thread scheduling - int n_tasks; + + struct ggml_task_profile task_profile; // performance int perf_runs; @@ -396,7 +447,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[4]; + char padding[12]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -407,6 +458,8 @@ extern "C" { int n_leafs; int n_threads; + struct ggml_mulmat_tune *tune; + size_t work_size; struct ggml_tensor * work; @@ -1287,9 +1340,21 @@ extern "C" { GGML_API int ggml_cpu_has_cublas (void); GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_gpublas (void); + GGML_API int ggml_cpu_has_cpublas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_vsx (void); + // + // mulmat task profiles + // + GGML_API void ggml_mulmat_init_task_profiles(void); + + GGML_API int ggml_mulmat_get_task_profiles( + struct ggml_task_profile_factory *pf, + enum ggml_type src0_t, + enum ggml_type src1_t, + struct ggml_task_profile **profiles); + // // Internal types and functions exposed for tests and benchmarks // diff --git a/llama.cpp b/llama.cpp index c165d3239..fa5a94e21 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #endif #include "llama-util.h" @@ -20,6 +21,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_MULMAT_TUNE +#include "ggml-tune.h" +#endif + #include #include #include @@ -280,6 +285,10 @@ struct llama_context { int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; +#ifdef GGML_USE_MULMAT_TUNE + struct ggml_mulmat_tune *tune = nullptr; +#endif + void use_buf(struct ggml_context * ctx, int i) { #if defined(LLAMA_USE_SCRATCH) size_t last_size = 0; @@ -1396,10 +1405,12 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; + gf.n_threads = n_threads; + +#ifdef GGML_USE_MULMAT_TUNE + gf.tune = lctx.tune; +#endif struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); ggml_set_name(embd, "embd"); @@ -2732,7 +2743,150 @@ struct llama_context * llama_init_from_file( return ctx; } +#ifdef GGML_USE_MULMAT_TUNE +bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { + printf("\n"); + if (ctx->model.n_gpu_layers != 0) { + fprintf(stderr, "[mulmat tune] error: tuning is disabled when GPU offloading is enabled\n"); + return false; + } + + const char *model_name = llama_model_type_name(ctx->model.type); + + llama_hparams *hparams = &ctx->model.hparams; + + enum ggml_ftype ggml_ftype; + switch (hparams->ftype) { + case LLAMA_FTYPE_ALL_F32: + ggml_ftype = GGML_FTYPE_ALL_F32; + break; + case LLAMA_FTYPE_MOSTLY_F16: + ggml_ftype = GGML_FTYPE_MOSTLY_F16; + break; + case LLAMA_FTYPE_MOSTLY_Q4_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_0; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16; + break; + case LLAMA_FTYPE_MOSTLY_Q5_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0; + break; + case LLAMA_FTYPE_MOSTLY_Q5_1: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_1; + break; + case LLAMA_FTYPE_MOSTLY_Q8_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q8_0; + break; + case LLAMA_FTYPE_MOSTLY_Q2_K: + ggml_ftype = GGML_FTYPE_MOSTLY_Q2_K; + break; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: + case LLAMA_FTYPE_MOSTLY_Q3_K_M: + case LLAMA_FTYPE_MOSTLY_Q3_K_L: + ggml_ftype = GGML_FTYPE_MOSTLY_Q3_K; + break; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: + case LLAMA_FTYPE_MOSTLY_Q4_K_M: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_K; + break; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: + case LLAMA_FTYPE_MOSTLY_Q5_K_M: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_K; + break; + case LLAMA_FTYPE_MOSTLY_Q6_K: + ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K; + break; + default: + throw std::runtime_error( + format("invalid model file type %d\n", hparams->ftype)); + } + + int n_vocab = hparams->n_vocab; + int n_embd = hparams->n_embd; + int n_rot = hparams->n_rot; + + int n_mult = hparams->n_mult; + int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; + + struct ggml_mulmat_tune_params params = { + /*.model =*/ { + /* .name =*/ model_name, + /* .ftype =*/ ggml_ftype, + /* .n_vocab =*/ n_vocab, + /* .n_embd =*/ n_embd, + /* .n_ff =*/ n_ff, + /* .n_rot =*/ n_rot, + }, + /* .m_num =*/ 8, + /* .n_pass =*/ 1, + /* .n_threads =*/ n_threads, + /* .progress =*/ true, + /* .output_console =*/ false, + /* .fname =*/ fname, + }; + + bool empty_fname = !fname || strcmp(fname, "") == 0; + + ctx->tune = new(struct ggml_mulmat_tune); + if (!ctx->tune) { + throw std::runtime_error(format("failed to allocate memory for tune\n")); + } + + if (tune) { + bool ok = ggml_mulmat_tune_bench(ctx->tune, &params); + if (!ok) { + ggml_mulmat_tune_free(ctx->tune); + return false; + } + if (!empty_fname) { + ggml_mulmat_tune_free(ctx->tune); + return true; + } + } else { + if (empty_fname) { + return false; + } + } + + if (!empty_fname) { + FILE *fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, "[mulmat tune] failed to open file %s.\n", + fname); + } else { + bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); + fclose(fp); + + if (!ok) { + fprintf(stderr, + "[mulmat tune] failed to read data from %s\n", + fname); + return false; + } + + fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname); + + ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, 
params.n_threads); + if (!ok) { + return false; + } + } + } + + return true; +} +#endif + void llama_free(struct llama_context * ctx) { +#ifdef GGML_USE_MULMAT_TUNE + if (ctx->tune) { + delete(ctx->tune); + } +#endif delete ctx; } diff --git a/llama.h b/llama.h index 1241ba6c0..c3f6a2154 100644 --- a/llama.h +++ b/llama.h @@ -300,6 +300,9 @@ extern "C" { // Print system information LLAMA_API const char * llama_print_system_info(void); + // Experimental utility functionality for mulmat tuning. + LLAMA_API bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname); + #ifdef __cplusplus } #endif diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 000000000..f4b8ee1b3 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,2 @@ +/test-ggml-threading +/test-ggml-tune diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c..977b8ef6d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,3 +12,5 @@ llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) # llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW +llama_add_test(test-ggml-threading.c) +llama_add_test(test-ggml-tune.c) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c new file mode 100644 index 000000000..0b47623e2 --- /dev/null +++ b/tests/test-ggml-threading.c @@ -0,0 +1,345 @@ +#include "ggml-threading.h" +#include "ggml.h" + +#include +#include +#include +#include + +// Purposes: +// 1. general overview of the threading behaviors. +// 2. race (deadlock) detection. + +// # build +// cd build +// +// # build release: +// cmake .. && cmake --build . --config Release +// +// # build with sanitize: +// cmake .. -DLLAMA_SANITIZE_THREAD=ON && cmake --build . --config Release +// +// # run: +// ./bin/test-ggml-threading + +// How to turn off the warning on Apple: malloc: nano zone abandoned due to +// inability to reserve vm space? +// ==> export MallocNanoZone=0, no need to rebuild. +// See `nano_init()` from +// https://opensource.apple.com/source/libmalloc/libmalloc-140.40.1/src/nano_malloc.c.auto.html

// How to view the threading debug: +// ==> uncomment `#define GGML_THREADING_DEBUG 1` from file ggml-threading.c + +#define UNUSED(x) (void)(x) + +#define MAX_N_THREADS 16 + +static const int n_repeat = 10; + +// Atomics are awkward to use with C11 on Windows, so the atomic counter is +// replaced with a plain per-thread array. 
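A minimal usage sketch for the llama_mulmat_tune() API declared in the llama.h hunk above, assuming a build with LLAMA_MULMAT_TUNE enabled. The wrapper function, its name, and the error handling below are illustrative assumptions and are not part of this patch; only the llama_mulmat_tune() signature comes from it.

#include <stdbool.h>
#include <stdio.h>
#include "llama.h"

// Hypothetical helper: benchmark (or load) mulmat tune data for a loaded context.
static int run_mulmat_tune(struct llama_context * ctx, int n_threads,
                           bool bench, const char * fname) {
    // bench == true: run the benchmark now (with a non-empty fname the tuner
    // can persist the result); bench == false: load previously saved data
    // from fname, per the llama.cpp hunk above.
    if (!llama_mulmat_tune(ctx, n_threads, bench, fname)) {
        fprintf(stderr, "[mulmat tune] setup failed\n");
        return 1;
    }
    return 0;
}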
+static int work_done_arr[MAX_N_THREADS]; + +static enum ggml_compute_error +mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { + int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; + if (node->task_profile.stages[params->type].parallel) { + loops /= params->nth; + } + + volatile int64_t j = 0; + for (int i = 0; i < loops; i++) { + j++; + } + + UNUSED(j); + + work_done_arr[params->ith]++; + return GGML_COMPUTE_OK; +} + +int test_driver(int id, struct ggml_tensor *node, int n_threads) { + printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads); + + for (int i = 0; i < n_threads; i++) { + work_done_arr[i] = 0; + } + + bool wait_on_done = (node->task_profile.dev_flags[0] > 0u); + + enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF; + if (wait_on_done) { + features |= GGML_THREADING_FEATURE_WAIT_ON_DONE; + } + + int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = + ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, features, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + for (int i = 0; i < n_repeat; i++) { + enum ggml_compute_error err = ggml_threading_compute_tensor( + ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + if (err != GGML_COMPUTE_OK) { + ggml_threading_stop(ctx); + fprintf(stderr, + "ggml_threading_compute_tensor failed with error: %d.\n", + err); + return 1; + } + } + + int t2 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int t3 = (int)ggml_time_us(); + + int expect = 0; + for (int i = 0; i < 3; i++) { + struct ggml_task_stage *ts = &node->task_profile.stages[i]; + if (ts->backend != GGML_TASK_BACKEND_NONE) { + if (ts->parallel) { + expect += n_threads; + } else { + expect++; + } + } + } + expect *= n_repeat; + + int actual = 0; + for (int i = 0; i < n_threads; i++) { + actual += work_done_arr[i]; + } + + uint8_t loops = node->task_profile.dev_flags[1]; + + printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: " + "(parallel: %d, " + "wait: %d)\n" + "\tstage-1: (parallel: %d, wait: %d)\n", + loops, wait_on_done, node->task_profile.stages[0].parallel, + node->task_profile.stages[0].wait, + node->task_profile.stages[1].parallel, + node->task_profile.stages[1].wait); + + if (actual == expect) { + printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f " + "ms, total %6.3f ms\n", + 1.0 * (t1 - t0) / 1000, 1.0 * (t2 - t1) / 1000, + 1.0 * (t3 - t2) / 1000, 1.0 * (t3 - t0) / 1000); + return 0; + } + + fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect, + actual); + + return 2; +} + +static enum ggml_compute_error +mock_task_runner_fallback(struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + if (node->backend == GGML_BACKEND_GPU) { + // ... finally failed to compute on the GPU. + + node->backend = GGML_BACKEND_CPU; + return GGML_COMPUTE_FALLBACK; + } else { + return GGML_COMPUTE_OK; + } +} + +// By design, fallback should happen when attempting to compute a tensor on +// the GPU, thus it is not parallelized. 
+int test_fallback(struct ggml_tensor *node) { + struct ggml_threading_context *ctx = ggml_threading_start( + 1, ggml_threading_graph_compute_thread, mock_task_runner_fallback, + /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); + + enum ggml_compute_error err = + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + if (err == GGML_COMPUTE_FALLBACK) { + err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, + /*wsize*/ 0); + } + + ggml_threading_stop(ctx); + if (err != GGML_COMPUTE_OK) { + fprintf(stderr, + "ggml_threading_compute_tensor failed with error: %d.\n", err); + return 1; + } + + return 0; +} + +int main(void) { + ggml_time_init(); + + struct ggml_tensor node; + memset(&node, 0, sizeof(struct ggml_tensor)); + + struct ggml_task_stage *stages = node.task_profile.stages; + + stages[0].backend = GGML_TASK_BACKEND_CPU; + stages[1].backend = GGML_TASK_BACKEND_CPU; + stages[2].backend = GGML_TASK_BACKEND_NONE; + + int n_passed = 0; + int n_tests = 0; + + int parallel[3] = {0, 1, 2}; + + // In GitHub build actions (windows-latest-cmake and ubuntu-latest-cmake): + // when n_threads >= 4, the thread init time and compute time suddenly become + // 100x ~ 1000x slower compared to n_threads == 2. + // + // But the tests (n_threads 1, 2, 4, 6) look sound on my devices: + // - MacBook Air 2013, Ubuntu 22.04 + // - MacBook Pro 2018, macOS 13.4 + // + // So I assume the GitHub build host has a limited multi-CPU quota. + // Computing is skipped when the threading init time is too slow. + // + // NOTE: it's observed that when the workload is 0 and n_threads >= the + // number of physical cores: + // - the wait/wakeup time varies a lot: it can be tens or hundreds of times + // the average, thus greatly punishing small workloads. + // - wait_on_done is generally faster than wait_now, and can be 10x faster. + + int threads_arr[] = {1, 2, 4, 8}; + int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); + + // millions of loops. + uint8_t workload_arr[] = {0u, 1u, 10u}; + int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]); + + // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops. + + for (int x = 0; x < workload_arr_len; x++) { + node.task_profile.dev_flags[1] = workload_arr[x]; + + for (int i = 0; i < threads_arr_len; i++) { + int n_threads = threads_arr[i]; + if (n_threads > MAX_N_THREADS) { + abort(); + } + + printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, " + "loops: %2d million(s) ====\n", + n_repeat, n_threads, workload_arr[x]); + + if (n_threads > 1) { // skip this n_threads when too slow. 
+ int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, 0, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int elapsed_us = t1 - t0; + if (elapsed_us > 500 * n_threads) { + fprintf(stderr, + "[test-ggml-threading] warning: it took %.3f " + "ms to start %d worker thread(s).\n", + 1.0 * elapsed_us / 1000, n_threads - 1); + fprintf(stderr, "[test-ggml-threading] warning: looks like " + "the environment is too slow to run this " + "number of threads, skip.\n"); + continue; + } + } + + // multi-threads: parallel + wait_now/wait_on_done + + if (n_threads == 1) { + stages[0].parallel = false; + stages[1].parallel = false; + stages[0].wait = false; + stages[1].wait = false; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + continue; + } + + for (int j = 0; j < 3; j++) { + stages[0].wait = false; + stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; + + if (parallel[j] == 0) { + stages[0].parallel = false; + stages[1].parallel = false; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } else if (parallel[j] == 1) { + stages[0].parallel = true; + stages[1].parallel = false; + + for (int k = 0; k < 2; k++) { + stages[1].wait = (k == 1); + + if (!stages[1].wait) { + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + continue; + } + + // wait + + for (int m = 0; m < 2; m++) { + if (m == 1) { + node.task_profile.dev_flags[0] = 1u; + } + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + node.task_profile.dev_flags[0] = 0u; + } + } + } else { + stages[0].parallel = true; + stages[1].parallel = true; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } + } + } + } + + { + ++n_tests; + + node.backend = GGML_BACKEND_GPU; + if (test_fallback(&node) == 0) { + ++n_passed; + printf("\n[test-ggml-threading] test fallback: ok\n\n"); + } + } + + printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests); + + return (n_passed == n_tests) ? 
0 : 1; +} diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c new file mode 100644 index 000000000..ed612fff4 --- /dev/null +++ b/tests/test-ggml-tune.c @@ -0,0 +1,200 @@ +#include "ggml-tune.h" +#include "ggml.h" + +#include + +static int bench(void); +static int estimate_time_non_zero_NK(void); + +static void init_params(struct ggml_mulmat_tune_params *params, int m_num) { + *params = (struct ggml_mulmat_tune_params){ + .model = + (struct ggml_mulmat_tune_model){ + .name = "3B", // fake + .ftype = GGML_FTYPE_MOSTLY_Q4_0, + .n_vocab = 4096, + .n_embd = 1024, + .n_ff = 2048, + .n_rot = 128, + }, + .m_num = m_num, + .n_pass = 1, + .n_threads = 1, + .progress = false, + .output_console = true, + .fname = NULL}; +} + +int main(void) { + int rv = bench(); + if (rv != 0) { + return rv; + } + + printf("\n"); + + rv = estimate_time_non_zero_NK(); + if (rv != 0) { + return rv; + } + printf("\n"); + + return 0; +} + +static int bench(void) { + printf("test: %s\n", __func__); + + { + enum ggml_task_backend backends[16]; + int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); + if (n_backends < 2) { + printf("test: %s, skipped because no BLAS\n", __func__); + return 0; + } + } + + { + struct ggml_init_params init_params = { + /*.mem_size =*/1, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/0, + }; + struct ggml_context *ctx = ggml_init(init_params); + GGML_ASSERT(ctx); + ggml_free(ctx); + } + + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + + init_params(&params, /*m_num*/ 4); + + bool ok = ggml_mulmat_tune_bench(&tune, &params); + ggml_mulmat_tune_free(&tune); + + return ok ? 0 : 1; +} + +int estimate_time_non_zero_NK(void) { + printf("test: %s\n", __func__); + + struct test_data_t { + int M; + int time[3]; // 3 profiles. 
+ }; + + struct ggml_mulmat_tune tune = { + .version = 1, + .ftype = GGML_FTYPE_MOSTLY_Q4_0, + }; + + const int m_num = 2; + + struct ggml_task_profile_factory pf; + memset(&pf, 0, sizeof(struct ggml_task_profile_factory)); + + { + pf.n_qxx_f32 = 2; + pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + + pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU; + pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + } + + struct ggml_mulmat_tune_params params; + init_params(&params, m_num); + + ggml_mulmat_tune_init(&tune, &params, &pf); + + struct ggml_mulmat_tune_shape *shape = NULL; + for (int i = 0; i < tune.n_shapes; i++) { + if (tune.shapes[i].N > 0 && tune.shapes[i].K > 0) { + shape = &tune.shapes[i]; + break; + } + } + GGML_ASSERT(shape); + GGML_ASSERT(shape->n_profiles == 2); + GGML_ASSERT(ggml_is_quantized(shape->src0_type)); + + printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K, + shape->n_profiles); + + { + shape->items[0] = + (struct ggml_mulmat_tune_m){.M = 2, .stages_time = {2, 4, 0}}; + shape->items[1] = + (struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 8, 0}}; + + shape->items[2] = + (struct ggml_mulmat_tune_m){.M = 2, .stages_time = {4, 4, 0}}; + shape->items[3] = + (struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 4, 0}}; + } + + const struct test_data_t test_data[] = { + { + .M = 1, // out of range + .time = {3, 8}, + }, + { + .M = 2, + .time = {6, 8}, + }, + { + .M = 3, + .time = {9, 8}, + }, + { + .M = 4, + .time = {12, 8}, + }, + { + .M = 5, // out of range + .time = {15, 8}, + }, + }; + + int n_tests = (int)(sizeof(test_data) / sizeof(struct test_data_t)); + + struct ggml_mulmat_tune_time profile_time[GGML_MAX_TASK_PROFILES]; + size_t profile_time_sz = + sizeof(struct ggml_mulmat_tune_time) * GGML_MAX_TASK_PROFILES; + + int n_passed = 0; + for (int i = 0; i < n_tests; i++) { + memset(profile_time, 0, profile_time_sz); + const struct test_data_t *e = &test_data[i]; + + const struct ggml_mulmat_tune_shape *matched_shape = + ggml_mulmat_tune_get_shape(&tune, shape->N, shape->K, + shape->src0_type, shape->src1_type); + GGML_ASSERT(matched_shape); + GGML_ASSERT(matched_shape == shape); + + ggml_mulmat_tune_estimate_time(matched_shape, e->M, profile_time); + + for (int j = 0; j < shape->n_profiles; j++) { + int actual = profile_time[j].total_time; + int expect = e->time[j]; + if (expect != actual) { + fprintf(stderr, + "test fail. i: %d, j: %d, M: %d, expect: " + "%d, actual: %d\n", + i, j, e->M, expect, actual); + } else { + ++n_passed; + } + } + } + + n_tests *= shape->n_profiles; + printf("%2d of %2d passed\n", n_passed, n_tests); + + ggml_mulmat_tune_free(&tune); + + return n_passed == n_tests ? 0 : 1; +}
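For reference, a minimal sketch of how the task-profile structures added to ggml.h can be populated, mirroring the qxx_f32 profiles set up in test-ggml-tune.c above (parallel CPU INIT stage, single-threaded CPU_BLAS COMPUTE stage with waiting workers). The helper name and the exact stage/flag choices are illustrative assumptions, not part of this patch.

#include <string.h>
#include "ggml.h"

// Hypothetical helper: build a profile where INIT runs in parallel on the
// CPU and COMPUTE runs single-threaded through BLAS while idle workers wait.
static struct ggml_task_profile make_cpu_blas_profile(void) {
    struct ggml_task_profile p;
    memset(&p, 0, sizeof(p));

    // stage 0: INIT -- split across workers on the CPU.
    p.stages[0].backend  = GGML_TASK_BACKEND_CPU;
    p.stages[0].parallel = true;

    // stage 1: COMPUTE -- one thread calls into BLAS, others are hinted to wait.
    p.stages[1].backend  = GGML_TASK_BACKEND_CPU_BLAS;
    p.stages[1].parallel = false;
    p.stages[1].wait     = true;

    // stage 2: FINALIZE -- unused for this profile.
    p.stages[2].backend  = GGML_TASK_BACKEND_NONE;
    return p;
}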