mqy 2023-06-14 18:33:14 +08:00
parent ce2c7d72e2
commit 213f133701
25 changed files with 3664 additions and 518 deletions

.gitignore vendored

@@ -40,6 +40,7 @@ models/*
/server
/Pipfile
/libllama.so
/mulmat-tune
build-info.h
arm_neon.h

CMakeLists.txt

@@ -78,6 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF)
#
# Build info header
@@ -214,6 +215,7 @@ if (LLAMA_BLAS)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
add_compile_definitions(GGML_BLAS_VENDOR="${LLAMA_BLAS_VENDOR}")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
@@ -276,6 +278,11 @@ if (LLAMA_METAL)
)
endif()
if (LLAMA_MULMAT_TUNE)
add_compile_definitions(GGML_USE_MULMAT_TUNE)
add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG)
endif()
if (LLAMA_K_QUANTS)
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
add_compile_definitions(GGML_USE_K_QUANTS)
@@ -450,6 +457,8 @@ endif()
add_library(ggml OBJECT
ggml.c
ggml-threading.c
ggml-tune.c
ggml.h
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}

Makefile

@@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple mulmat-tune
ifdef LLAMA_BUILD_SERVER
BUILD_TARGETS += server
@@ -47,7 +47,8 @@ endif
OPT = -O3
CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
# -lm fixed error: ggml.o: undefined reference to symbol 'tanhf@@GLIBC_2.2.5' from ubuntu 22.04
LDFLAGS = -lm
ifdef LLAMA_DEBUG
CFLAGS += -O0 -g
@@ -134,8 +135,7 @@ ifndef LLAMA_NO_K_QUANTS
endif
ifndef LLAMA_NO_ACCELERATE
# Mac Intel & M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
@@ -145,10 +145,16 @@ endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
ifeq ($(UNAME_S),Darwin)
# openblas installed with Homebrew on macOS.
CFLAGS += -I/usr/local/opt/openblas/include
LDFLAGS += -L/usr/local/opt/openblas/lib
endif
endif # LLAMA_OPENBLAS
ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
CFLAGS += -DGGML_BLAS_VENDOR="\"BLIS\""
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
@@ -230,6 +236,11 @@ k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
ifdef LLAMA_MULMAT_TUNE
CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG
CXXFLAGS += -DGGML_USE_MULMAT_TUNE
endif
#
# Print build information
#
@@ -245,6 +256,8 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
OBJS += ggml-tune.o ggml-threading.o
#
# Build library
#
@@ -253,7 +266,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-threading.o: ggml-threading.c ggml.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-tune.o: ggml-tune.c ggml.h
$(CC) $(CFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -298,6 +316,9 @@ server: examples/server/server.cpp examples/server/httplib.h examples/server/jso
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
mulmat-tune: examples/mulmat-tune/mulmat-tune.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o mulmat-tune $(LDFLAGS)
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \

examples/CMakeLists.txt

@@ -39,6 +39,7 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(simple)
add_subdirectory(mulmat-tune)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()

examples/common.cpp

@@ -345,6 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.mem_test = true;
} else if (arg == "--export") {
params.export_cgraph = true;
#ifdef GGML_USE_MULMAT_TUNE
} else if (arg == "--tune") {
params.tune = true;
} else if (arg == "--tune-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.tune_file = argv[i];
#endif // GGML_USE_MULMAT_TUNE
} else if (arg == "--verbose-prompt") { } else if (arg == "--verbose-prompt") {
params.verbose_prompt = true; params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") { } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -498,6 +508,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
#ifdef GGML_USE_MULMAT_TUNE
fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n");
fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
#endif
fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");

examples/common.h

@@ -77,6 +77,8 @@ struct gpt_params {
bool mem_test = false; // compute maximum memory usage
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
bool tune = false; // mulmat tune: enable
std::string tune_file = ""; // mulmat tune: data file
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

examples/main/main.cpp

@@ -117,6 +117,16 @@ int main(int argc, char ** argv) {
return 1;
}
#ifdef GGML_USE_MULMAT_TUNE
if (params.tune || !params.tune_file.empty()) {
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) {
llama_free(ctx);
return ok? 0: 1;
}
}
#endif
// print system information
{
fprintf(stderr, "\n");

examples/mulmat-tune/CMakeLists.txt

@@ -0,0 +1,14 @@
set(TARGET mulmat-tune)
add_executable(${TARGET} mulmat-tune.cpp)
if (XCODE OR MSVC)
set(MULMAT_TUNE_LIBS ggml)
else()
set(MULMAT_TUNE_LIBS ggml m)
endif()
target_link_libraries(${TARGET} PRIVATE ${MULMAT_TUNE_LIBS} ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

examples/mulmat-tune/README.md

@@ -0,0 +1,272 @@
# Mulmat Benchmark and Tuning
Apart from the standalone tool `mulmat-tune`, mulmat tune is also integrated into
`main` and `perplexity`. To avoid adding too many new CLI options, I just added two.
To make it run faster, `m_num` is set to 8 (thus the max M is 128) and `n_pass`
is set to 1.
With the newly added CLI options, we can use `main` and `perplexity` in the
following three ways:
* bench and run: --tune
* bench and exit: --tune --tune-file <FILE>
* load and run: --tune-file <FILE>
The `load` mode reads an existing data file. This is fine because we can run the
bench ahead of time (saving tens of seconds), but it has two shortcomings:
- we have to re-run the bench whenever the file format changes; this is OK because
  we are made aware of it.
- the more subtle problem is that the algorithm may have changed silently while we
  are still using an outdated file. So I integrated mulmat tune into `main` and
  `perplexity` as a complementary solution.
## Build into main and perplexity
Makefile:
```
make clean && LLAMA_MULMAT_TUNE=1 make
```
CMake (with BLAS):
```
cmake --build . --target clean
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
cmake --build . --config Release
```
Run examples:
```
# bench and run:
./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune
# bench then exit:
./main -m ./models/3B/open-llama-3b-q4-0.bin --tune --tune-file <FILE>
# load and run
./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file <FILE>
```
# Build the standalone `mulmat-tune`
Makefile:
```
make clean && LLAMA_MULMAT_TUNE=1 make
```
CMake (with BLAS)
```
cmake --build . --target clean
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
cmake --build . --config Release
```
Run examples:
```
./mulmat-tune -h
# run with default params (7B, Q4_0, ...)
./mulmat-tune
# set model
./mulmat-tune --model 13B
# set ggml ftype, 2 for Q4_0, 3 for Q4_1, run `mulmat-tune -h` for help.
./mulmat-tune --ftype 3
# customized m_num
./mulmat-tune --m_num 8
# customized n_pass: run 1 pass only instead of the default 3.
./mulmat-tune --n_pass 1
# customized n_threads instead of the default 1.
./mulmat-tune --n_threads 4
# save to file
./mulmat-tune --file <FILE>
# save to file, always override if exists (CAUTION!)
./mulmat-tune --file <FILE> -y
```
# End to End Test
## Compare With Master
You may want to run the following commands. Make sure the tune result file is
set up properly.
General steps:
1. run `./mulmat-tune -h` to see how to build for misc vendors.
   You can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the debug output, e.g.:
```
make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make
```
On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
with `CUDA` or `CL`, you may not see `CUDA` or `CL` in the debug output because `CPU`
or `CPU_BLAS` is faster (according to the estimate from mulmat tune).
2. create a small prompt file:
```
head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
```
3. run any of the following example commands.
```
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
```
* `--mlock` is recommended for `macOS`; you may not want to use it.
* don't change `-c 128`: a context size that is too large causes a 0-perplexity chunk.
* `-t` is the number of threads; `1`, `2`, `4` or `6` are recommended.
* you can change the batch size (`-b`) between `1` and `128`.
* you may want to add other CLI options.
The following results were generated with Accelerate compiled in.
### 1 thread
**Master (2d43387d)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 43.53 | 339.95 |
| 16 | 44.31 | 346.12 |
| 24 | 43.14 | 336.90 |
| 32 | 33.59 | 262.25 |
| 40 | 27.64 | 215.77 |
| 48 | 24.52 | 191.42 |
```
**This branch (tune)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 43.78 | 341.96 |
| 16 | 42.88 | 334.93 |
| 24 | 42.06 | 328.42 |
| 32 | 33.07 | 258.25 |
| 40 | 28.69 | 223.98 |
| 48 | 25.65 | 200.19 |
```
### 4 threads
**Master (2d43387d)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 12.43 | 96.99 |
| 16 | 12.10 | 94.44 |
| 24 | 12.81 | 99.95 |
| 32 | 31.64 | 247.04 |
| 48 | 24.55 | 191.63 |
| 64 | 17.56 | 137.09 |
| 96 | 17.59 | 137.25 |
| 128 | 10.73 | 83.74 |
```
**This branch (no tune)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 12.31 | 96.07 |
| 16 | 12.00 | 93.63 |
| 24 | 12.07 | 94.15 |
| 32 | 20.34 | 158.76 |
| 48 | 15.86 | 123.73 |
| 64 | 10.98 | 85.69 |
| 96 | 11.24 | 87.66 |
| 128 | 7.53 | 58.77 |
```
**This branch (tune)**
```
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
| --- | --- | --- |
| 8 | 12.48 | 97.37 |
| 16 | 12.26 | 95.70 |
| 24 | 12.25 | 95.53 |
| 32 | 11.98 | 93.58 |
| 48 | 12.57 | 98.12 |
| 64 | 11.28 | 88.05 |
| 96 | 9.55 | 74.53 |
| 128 | 7.51 | 58.61 |
```
# Bench Data Format
**Example**
```
5 3B 2 6 1
3200 3200 2 0 3 10
16 0 0 0 16 1 0 1 0 0 0 0
16 1 0 2 17 0 1 0 0 0 0 0
0 0 0 0 34 0 1 0 0 0 0 0
1 1 793 0 9103 2102 0 0 6014 0
2 2 1591 0 8034 2305 0 0 30982 0
4 4 2236 0 6476 2484 0 0 31388 0
8 7 4161 0 6623 2389 0 0 29204 0
16 15 8339 0 6434 2752 0 0 34303 0
32 32 16919 0 6915 3651 0 0 42511 0
64 200 34270 0 6574 4528 0 0 68212 0
128 188 69400 0 6325 6839 0 0 74437 0
256 303 134597 0 6168 11544 0 0 110180 0
512 687 279685 0 6337 29712 0 0 159728 0
3200 8640 2 0 2 10
...
```
**Informal Explanation**
```
head
groups+
head := version model ggml_ftype n_shapes n_threads
shape+
# head
version: 1
model: "3B" | "7B" | "13B" | "30B" | "65B"
ggml_ftype: 0 - 4, 7 - 14
n_shapes: number of shapes
n_threads: number of threads
shape := N K m_num n_profiles
task_conf_profile+
bench_item+
task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize)
stage_conf: backend parallel wait
backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL)
parallel: 0 (false) | 1 (true)
wait: 0 (false) | 1 (true)
bench_item: M profile_time+
profile_time := stage_time[3]
stage_time[3]: init_time, compute_time, finalize_time
```
A task stage is invalid if its backend equals `GGML_TASK_BACKEND_NONE`.
The time unit is `us`. A column is all zeros when that stage does not exist.
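For example, in the sample above the row `16 15 8339 0 6434 2752 0 0 34303 0` is
M = 16 followed by init/compute/finalize times for each of the shape's 3 profiles.
Below is a minimal C sketch of a reader for one such row (a hypothetical helper, not
part of this commit; it assumes `n_profiles` has already been read from the shape line):
```
#include <stdio.h>

// Read one bench_item row: M, then 3 stage times (init, compute, finalize)
// in microseconds for each of the shape's n_profiles profiles.
// A stage time of 0 means that stage does not exist for the profile.
// Returns 0 on success, -1 on a malformed row.
static int read_bench_item(FILE *fp, int n_profiles, int *M, int stages_time[][3]) {
    if (fscanf(fp, "%d", M) != 1) {
        return -1;
    }
    for (int ip = 0; ip < n_profiles; ip++) {
        if (fscanf(fp, "%d %d %d", &stages_time[ip][0], &stages_time[ip][1],
                   &stages_time[ip][2]) != 3) {
            return -1;
        }
    }
    return 0;
}
```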
# NOTE
1. "3B" is [open-llama 3B](https://github.com/ggerganov/llama.cpp/pull/1588).
2. Model names are subject to change: we may support something like X-3B, Y-4B, ...
3. As of Jun 1, this tool is still at an early stage and will change frequently
over the coming days (or weeks).

examples/mulmat-tune/mulmat-tune.cpp

@@ -0,0 +1,277 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "build-info.h"
#include "ggml-tune.h"
#include "ggml.h"
#include "llama.h"
#define UNUSED(x) (void)(x)
static void print_build_tips(void) {
const char *a = "LLAMA_NO_ACCELERATE";
fprintf(stderr, "Tips on how to build with various backend vendors:\n\n");
fprintf(stderr, "CUDA: make clean; LLAMA_CUBLAS=1 make\n");
fprintf(stderr, "CL: make clean; LLAMA_CLBLAST=1 make\n");
fprintf(stderr, "Accelerate: make clean; %s= make\n", a);
fprintf(stderr, "OpenBLAS: make clean; %s=1 LLAMA_OPENBLAS=1 make\n", a);
fprintf(stderr, "BLIS: make clean; %s=1 LLAMA_BLIS=1 make\n", a);
fprintf(stderr, "\n");
fprintf(stderr, "NOTE: for CUDA/CL, use %s=1 to disable ACCELERATE\n", a);
}
static bool prompt_yes_no(const char *prompt) {
char buf[2];
while (true) {
fprintf(stderr, "%s (Y|n)\n", prompt);
buf[0] = 0;
buf[1] = 0;
int i = 0;
int c = 0;
while (c != '\n') {
c = fgetc(stdin);
buf[i % 2] = c;
i++;
}
if (i == 1) {
if (buf[0] == '\n') {
return true;
}
} else if (i == 2) {
if (buf[0] == 'Y' || buf[0] == 'y') {
return true;
}
if (buf[0] == 'N' || buf[0] == 'n') {
return false;
}
}
}
}
static void cmd_analyze(struct ggml_mulmat_tune *tune);
static void usage(char *prog) {
const char *usage_lines[] = {
"usage: %s args",
"",
"bench [-m MODEL] [-t TYPE] [-f FILE] [-y]",
"--model MODEL 3B | 7B | 13B | 30B | 65B",
" default 7B",
"--ftype FTYPE ggml ftype:",
" 0: all F32",
" 1: mostly F16",
" 2: mostly Q4_0",
" 3: mostly Q4_1",
" 4: mostly Q4_1, some F16",
" 7: mostly Q8_0",
" 8: mostly Q5_0",
" 9: mostly Q5_1",
" 10: mostly Q2_K",
" 11: mostly Q3_K",
" 12: mostly Q4_K",
" 13: mostly Q5_K",
" 14: mostly Q6_K",
" default 2 (mostly Q4_0)",
"--m_num M_NUM number of M, the max M = 2^(M_NUM-1)",
" requires between [6, 12]",
" default 10",
"--n_pass PASS number of passes to run",
" default 1",
" requires: between [1, 3]",
"--n_threads NTH bench with this number of threads",
" requires: between [1, 16]",
" default 1",
"--file FILE data file to write",
" default stdout",
"-y always answer \"yes\" to all prompts",
};
int len = (int)(sizeof(usage_lines) / sizeof(char *));
for (int i = 0; i < len; i++) {
const char *line = usage_lines[i];
if (i == 0) {
fprintf(stderr, line, prog);
} else {
fprintf(stderr, "%s\n", line);
}
}
printf("\n");
print_build_tips();
printf("\n");
}
int main(int argc, char **argv) {
if (argc == 2) {
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
usage(argv[0]);
return 0;
}
}
int arg_start = 1;
const char *arg_model = NULL;
const char *arg_ftype = NULL;
const char *arg_m_num = NULL;
const char *arg_n_threads = NULL;
const char *arg_n_pass = NULL;
const char *arg_file = NULL;
bool always_yes = false;
for (int i = arg_start; i < argc; i++) {
if (strcmp(argv[i], "--model") == 0) {
if (i + 1 < argc) {
arg_model = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--ftype") == 0) {
if (i + 1 < argc) {
arg_ftype = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--m_num") == 0) {
if (i + 1 < argc) {
arg_m_num = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--n_pass") == 0) {
if (i + 1 < argc) {
arg_n_pass = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--n_threads") == 0) {
if (i + 1 < argc) {
arg_n_threads = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--file") == 0) {
if (i + 1 < argc) {
arg_file = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "-y") == 0) {
always_yes = true;
} else {
fprintf(stderr, "invalid arg: %s\n", argv[i]);
usage(argv[0]);
return 1;
}
}
enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0;
{
if (arg_ftype != NULL) {
int v = atoi(arg_ftype);
ftype = (enum ggml_ftype)v;
}
if (ftype > GGML_FTYPE_MOSTLY_Q5_1) {
fprintf(stderr, "k_quants type %d is not implemented\n", ftype);
return 1;
}
}
if (arg_file != NULL && !always_yes) {
struct stat st;
int rc = stat(arg_file, &st);
UNUSED(st);
if (rc == 0) { // prompt
size_t len = strlen(arg_file) + 50;
char *prompt = (char *)malloc(len);
GGML_ASSERT(prompt);
snprintf(prompt, len, "data file '%s' exists, override?", arg_file);
if (!prompt_yes_no(prompt)) {
printf("Aborted.\n");
return 1;
}
free(prompt);
}
}
int m_num = 10;
{
if (arg_m_num != NULL) {
int v = atoi(arg_m_num);
m_num = v;
}
if (m_num < 6 || m_num > 12) {
fprintf(stderr, "invalid m_num: %d, expect between [6, 12]\n",
m_num);
usage(argv[0]);
return 1;
}
}
int n_pass = 1;
{
if (arg_n_pass != NULL) {
int v = atoi(arg_n_pass);
n_pass = v;
}
if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) {
fprintf(stderr, "invalid n_pass: %d, expect between [1, %d]\n",
n_pass, GGML_MULMAT_MAX_PASS);
usage(argv[0]);
return 1;
}
}
int n_threads = 1;
{
if (arg_n_threads != NULL) {
int v = atoi(arg_n_threads);
n_threads = v;
if (n_threads < 1 || n_threads > 16) {
fprintf(stderr,
"invalid n_threads: %d, expect between [1, 16]\n",
n_threads);
usage(argv[0]);
return 1;
}
}
}
const char *model_name = "7B";
{
if (arg_model != NULL) {
model_name = arg_model;
}
}
// Let init message print earlier.
{
struct ggml_init_params init_params = {
/*.mem_size =*/1,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/0,
};
struct ggml_context *ctx = ggml_init(init_params);
GGML_ASSERT(ctx);
ggml_free(ctx);
}
struct ggml_mulmat_tune tune;
struct ggml_mulmat_tune_params params;
memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
ggml_mulmat_init_task_profiles();
ggml_mulmat_tune_model_init(&params.model, model_name, ftype);
params.m_num = m_num;
params.n_pass = n_pass;
params.n_threads = n_threads;
params.progress = true;
params.output_console = true;
params.fname = arg_file;
bool ok = ggml_mulmat_tune_bench(&tune, &params);
return ok ? 0 : 1;
}

examples/perplexity/perplexity.cpp

@@ -158,6 +158,16 @@ int main(int argc, char ** argv) {
return 1;
}
#ifdef GGML_USE_MULMAT_TUNE
if (params.tune || !params.tune_file.empty()){
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) {
llama_free(ctx);
return ok? 0: 1;
}
}
#endif
// print system information
{
fprintf(stderr, "\n");

ggml-cuda.cu

@@ -2571,7 +2571,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
func = ggml_cuda_rms_norm;
break;
case GGML_OP_MUL_MAT:
if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) {
return false;
}
func = ggml_cuda_mul_mat;

ggml-opencl.cpp

@@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_
}
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
// GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
if (src0->type == GGML_TYPE_F32) {
ggml_cl_mul_mat_f32(src0, src1, dst);

ggml-threading.c Normal file

@@ -0,0 +1,620 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ggml-threading.h"
#include "ggml.h"
#define UNUSED(x) (void)(x)
// see https://github.com/ggerganov/llama.cpp/pull/1314
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#include <emmintrin.h>
static inline void ggml_spin_pause(void) { _mm_pause(); }
#else
static inline void ggml_spin_pause(void) {}
#endif
#if defined(_WIN32)
#include <windows.h>
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
typedef LONG atomic_flag;
typedef CRITICAL_SECTION pthread_mutex_t;
typedef CONDITION_VARIABLE pthread_cond_t;
typedef void pthread_mutexattr_t;
typedef void pthread_condattr_t;
typedef HANDLE pthread_t;
static void atomic_store(atomic_int *ptr, LONG val) {
InterlockedExchange(ptr, val);
}
static LONG atomic_load(atomic_int *ptr) {
return InterlockedCompareExchange(ptr, 0, 0);
}
static LONG atomic_fetch_add(atomic_int *ptr, LONG inc) {
return InterlockedExchangeAdd(ptr, inc);
}
static LONG atomic_fetch_sub(atomic_int *ptr, LONG dec) {
return atomic_fetch_add(ptr, -(dec));
}
static inline LONG atomic_flag_test_and_set(volatile atomic_flag *ptr) {
return InterlockedCompareExchange(ptr, 1, 0);
}
static inline LONG atomic_flag_clear(volatile atomic_flag *ptr) {
return InterlockedExchange(ptr, 0);
}
static int pthread_create(pthread_t *out, void *unused,
ggml_thread_ret_t (*func)(void *), void *arg) {
(void)unused;
HANDLE handle =
CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
if (handle == NULL) {
return EAGAIN;
}
*out = handle;
return 0;
}
static int pthread_join(pthread_t thread, void *unused) {
(void)unused;
return (int)WaitForSingleObject(thread, INFINITE);
}
static int pthread_mutex_init(pthread_mutex_t *mutex,
pthread_mutexattr_t *attr) {
(void)attr;
InitializeCriticalSection(mutex);
return 0;
}
static int pthread_mutex_destroy(pthread_mutex_t *mutex) {
DeleteCriticalSection(mutex);
return 0;
}
static int pthread_mutex_lock(pthread_mutex_t *mutex) {
EnterCriticalSection(mutex);
return 0;
}
static int pthread_mutex_unlock(pthread_mutex_t *mutex) {
LeaveCriticalSection(mutex);
return 0;
}
static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr) {
(void)attr;
InitializeConditionVariable(cond);
return 0;
}
static int pthread_cond_destroy(pthread_cond_t *cond) {
(void)cond;
return 0;
}
static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) {
SleepConditionVariableCS(cond, mutex, INFINITE);
return 0;
}
static int pthread_cond_signal(pthread_cond_t *cond) {
WakeConditionVariable(cond);
return 0;
}
static int pthread_cond_broadcast(pthread_cond_t *cond) {
WakeAllConditionVariable(cond);
return 0;
}
static int sched_yield(void) {
// https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-yieldprocessor
YieldProcessor();
return 0;
}
#else // ! _WIN32
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#endif
// #define GGML_THREADING_DEBUG 1
#ifdef GGML_THREADING_DEBUG
#define PRINT_DEBUG(...) fprintf(stdout, __VA_ARGS__)
#else
#define PRINT_DEBUG(...)
#endif
struct ggml_perf_stats {
int runs;
// total cycles
atomic_int cycles;
// total time in us.
atomic_int time_us;
};
struct ggml_compute_state_shared {
atomic_flag spin;
pthread_mutex_t mutex;
pthread_cond_t cond;
// number of threads that have entered the thread runner.
atomic_int n_ready;
// number of assigned but unfinished tasks, workers decrease it.
atomic_int n_tasks;
// number of waiting workers, workers increase it.
atomic_int n_waiting;
// commands.
atomic_bool wait_now;
atomic_bool wait_on_done;
atomic_bool stop;
ggml_threading_task_runner *task_runner;
struct ggml_threading_context *ctx;
};
struct ggml_compute_state {
pthread_t thrd;
atomic_bool has_work;
struct ggml_compute_params params;
struct ggml_tensor *node;
struct ggml_compute_state_shared *shared;
};
struct ggml_threading_context {
int n_threads;
struct ggml_compute_state_shared shared;
struct ggml_compute_state *workers;
enum ggml_threading_features features;
struct ggml_perf_stats wait_perf;
struct ggml_perf_stats wakeup_perf;
int64_t *stages_time;
};
// NOTE: ggml_spin_lock and ggml_spin_unlock could be no-ops when the
// wait_on_done feature is off.
static inline void ggml_spin_lock(volatile atomic_flag *obj) {
while (atomic_flag_test_and_set(obj)) {
ggml_spin_pause();
}
}
static inline void ggml_spin_unlock(volatile atomic_flag *obj) {
atomic_flag_clear(obj);
}
static inline void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0,
int64_t t0) {
st->runs++;
st->cycles += (ggml_cycles() - c0);
st->time_us += (ggml_time_us() - t0);
}
// A worker thread goes cond waiting.
// NOTE: must be protected by shared->spin
static void ggml_threading_cond_wait(struct ggml_compute_state *state) {
struct ggml_compute_state_shared *shared = state->shared;
int64_t perf_cycles_0 = 0;
int64_t perf_time_0 = 0;
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
perf_cycles_0 = ggml_cycles();
perf_time_0 = ggml_time_us();
}
GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
if (!shared->wait_now) {
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
ggml_spin_unlock(&shared->spin);
return;
}
shared->n_waiting++;
ggml_spin_unlock(&shared->spin);
GGML_ASSERT(pthread_cond_wait(&shared->cond, &shared->mutex) == 0);
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
ggml_spin_lock(&shared->spin);
shared->n_waiting--;
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
ggml_perf_collect(&shared->ctx->wait_perf, perf_cycles_0, perf_time_0);
}
}
// Wakeup all workers.
//
// Workers take some time to wake up, and have to lock the spin lock after waking.
// Yield is used to avoid signaling too frequently. The current implementation is highly
// experimental. See tests/test-ggml-threading.c for details.
//
// NOTE: must be protected by shared->spin
static void
ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) {
int64_t perf_cycles_0 = 0;
int64_t perf_time_0 = 0;
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
perf_cycles_0 = ggml_cycles();
perf_time_0 = ggml_time_us();
}
shared->wait_now = false;
int loop_counter = 0;
int notify_counter = 0;
int64_t last_signal_time = 0;
while (shared->n_waiting != 0) {
ggml_spin_unlock(&shared->spin);
if (loop_counter > 0) {
ggml_spin_pause();
if (loop_counter > 3) {
sched_yield();
}
}
++loop_counter;
// TODO: should bench actual average wait/wakeup time.
if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) {
continue;
}
GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0);
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
++notify_counter;
last_signal_time = ggml_time_us();
ggml_spin_lock(&shared->spin);
}
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0,
perf_time_0);
}
// if (notify_counter > 1) {
// printf("%s: loop counter: %d, notify counter: %d\n", __func__,
// loop_counter, notify_counter);
// }
UNUSED(notify_counter);
}
// Setup workers for a task stage.
// NOTE: must be protected by shared->spin
static void ggml_threading_setup_workers(struct ggml_threading_context *ctx,
struct ggml_task_profile *profile,
enum ggml_task_type type) {
PRINT_DEBUG("[main] setup workers for task ...\n");
#ifdef GGML_THREADING_DEBUG
int64_t t0 = ggml_time_us();
#endif
const int n_worker_threads = ctx->n_threads - 1;
struct ggml_task_stage *current = &profile->stages[type];
struct ggml_compute_state_shared *shared = &ctx->shared;
if (current->parallel) {
if (shared->n_waiting > 0) {
ggml_threading_wakeup_workers(shared);
}
if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) {
// Optimize energy: wait_on_done. We MAY also check following nodes,
// but that's a bit complicated.
shared->wait_on_done = false;
for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) {
struct ggml_task_stage *next = &profile->stages[i];
if (next->parallel) {
break;
}
if (next->wait) {
shared->wait_on_done = true;
PRINT_DEBUG("[main] wait_on_done is enabled for "
"current task stage\n");
break;
}
}
}
} else if (current->wait) {
if (shared->n_waiting < n_worker_threads) {
shared->wait_now = true;
PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n",
n_worker_threads);
ggml_spin_unlock(&shared->spin);
while (shared->n_waiting != n_worker_threads) {
ggml_spin_pause();
}
ggml_spin_lock(&shared->spin);
PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads);
}
}
PRINT_DEBUG("[main] setup workers for task took %d us\n",
(int)(ggml_time_us() - t0));
}
ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
GGML_ASSERT(data);
struct ggml_compute_state *state = (struct ggml_compute_state *)data;
GGML_ASSERT(state);
struct ggml_compute_state_shared *shared = state->shared;
GGML_ASSERT(shared);
GGML_ASSERT(shared->task_runner);
shared->n_ready++;
PRINT_DEBUG("[%d-th] running\n", state->params.ith);
while (!shared->stop) {
if (shared->wait_now) {
ggml_spin_lock(&shared->spin);
if (!state->has_work) {
ggml_threading_cond_wait(state);
}
ggml_spin_unlock(&shared->spin);
}
if (shared->n_tasks > 0 && state->has_work) {
enum ggml_compute_error err =
shared->task_runner(&state->params, state->node);
GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
ggml_spin_lock(&shared->spin);
state->has_work = false;
shared->n_tasks--;
bool wait = shared->wait_on_done && !state->has_work;
if (wait) {
ggml_threading_cond_wait(state);
}
ggml_spin_unlock(&shared->spin);
// no need to pause.
if (wait) {
continue;
}
}
ggml_spin_pause();
}
PRINT_DEBUG("[%d-th] exited\n", state->params.ith);
return 0;
}
enum ggml_compute_error
ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
struct ggml_tensor *node, void *wdata,
size_t wsize) {
GGML_ASSERT(ctx);
GGML_ASSERT(node);
GGML_ASSERT(ctx->shared.task_runner);
struct ggml_compute_state_shared *state_shared = &ctx->shared;
// This is the params for main thread.
struct ggml_compute_params params;
enum ggml_compute_error err;
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
continue;
}
PRINT_DEBUG("[main] stage: %d\n", type);
int64_t t_stage = 0;
if (ctx->stages_time) {
t_stage = ggml_time_us();
}
// n_tasks is the total number of parallel computing tasks
// (including main thread).
int n_tasks =
node->task_profile.stages[type].parallel ? ctx->n_threads : 1;
ggml_spin_lock(&state_shared->spin);
if (ctx->n_threads > 1) {
ggml_threading_setup_workers(ctx, &node->task_profile, type);
}
if (n_tasks > 1) {
// setup compute task parameters.
for (int j = 0; j < n_tasks - 1; j++) {
ctx->workers[j].params = (struct ggml_compute_params){
.type = type,
.ith = j + 1,
.nth = n_tasks,
.wsize = wsize,
.wdata = wdata,
};
ctx->workers[j].node = node;
ctx->workers[j].has_work = true;
}
state_shared->n_tasks = n_tasks - 1;
PRINT_DEBUG("[main] assigned %d tasks\n", state_shared->n_tasks);
}
ggml_spin_unlock(&state_shared->spin);
// main thread always run the 0-th task.
// TODO: assert(params->nth == 1) instead of
// assert(params->ith == 0)
{
params.type = type;
params.ith = 0;
params.nth = n_tasks;
params.wsize = wsize;
params.wdata = wdata;
err = state_shared->task_runner(&params, node);
}
// wait for tasks done.
if (n_tasks > 1) {
while (state_shared->n_tasks != 0) {
ggml_spin_pause();
}
}
PRINT_DEBUG("[main] all tasks finished\n\n");
if (ctx->stages_time) {
ctx->stages_time[type] = ggml_time_us() - t_stage;
}
if (err != GGML_COMPUTE_OK) {
return err;
}
}
return GGML_COMPUTE_OK;
}
struct ggml_threading_context *
ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
ggml_threading_task_runner *task_stage_runner,
enum ggml_threading_features features,
int64_t stages_time[3]) {
GGML_ASSERT(n_threads > 0);
GGML_ASSERT(thread_runner);
GGML_ASSERT(task_stage_runner);
size_t ctx_sz = sizeof(struct ggml_threading_context);
struct ggml_threading_context *ctx = malloc(ctx_sz);
GGML_ASSERT(ctx);
memset(ctx, 0, ctx_sz);
ctx->shared = (struct ggml_compute_state_shared){
.spin = {0},
.n_ready = 0,
.n_tasks = 0,
.n_waiting = 0,
.wait_now = false,
.wait_on_done = false,
.stop = false,
.task_runner = task_stage_runner,
.ctx = ctx,
};
PRINT_DEBUG("[main] thread start, features: %d\n", features);
ctx->n_threads = n_threads;
ctx->features = features;
ctx->stages_time = stages_time;
int n_workers = n_threads - 1;
if (n_workers > 0) {
GGML_ASSERT(pthread_mutex_init(&ctx->shared.mutex, NULL) == 0);
GGML_ASSERT(pthread_cond_init(&ctx->shared.cond, NULL) == 0);
size_t workers_sz = sizeof(struct ggml_compute_state) * n_workers;
struct ggml_compute_state *workers = malloc(workers_sz);
GGML_ASSERT(workers);
memset(workers, 0, workers_sz);
for (int j = 0; j < n_workers; j++) {
workers[j].shared = &ctx->shared;
GGML_ASSERT(pthread_create(&workers[j].thrd, NULL, thread_runner,
&workers[j]) == 0);
}
ctx->workers = workers;
while (ctx->shared.n_ready != n_workers) {
ggml_spin_pause();
}
}
return ctx;
}
static void
ggml_threading_print_perf_stats(struct ggml_threading_context *ctx) {
bool print_stats = ctx->features & GGML_THREADING_FEATURE_PERF;
#ifdef GGML_THREADING_DEBUG
print_stats = true;
#endif
if (!print_stats) {
return;
}
const char *prefix_arr[2] = {"[threading wait ]", "[threading wakeup]"};
struct ggml_perf_stats *st_arr[2] = {&ctx->wait_perf, &ctx->wakeup_perf};
for (int i = 0; i < 2; i++) {
struct ggml_perf_stats *st = st_arr[i];
if (st->runs == 0) {
continue;
}
fprintf(stdout,
"%s runs: %4d, avg cycles: %8.3f ms, avg time: "
"%8.3f ms\n",
prefix_arr[i], st->runs,
1.0 * st->cycles / (st->runs * ggml_cycles_per_ms()),
1.0 * st->time_us / (st->runs * 1000));
}
}
void ggml_threading_stop(struct ggml_threading_context *ctx) {
GGML_ASSERT(ctx);
if (ctx->workers) {
PRINT_DEBUG("[main] stopping thread pool ...\n");
ctx->shared.stop = true;
ggml_spin_lock(&ctx->shared.spin);
ggml_threading_wakeup_workers(&ctx->shared);
ggml_spin_unlock(&ctx->shared.spin);
for (int j = 0; j < ctx->n_threads - 1; j++) {
GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0);
}
free(ctx->workers);
PRINT_DEBUG("[main] thread pool stopped\n");
}
ggml_threading_print_perf_stats(ctx);
free(ctx);
}

ggml-threading.h Normal file

@@ -0,0 +1,68 @@
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
#if defined(_WIN32)
typedef int ggml_thread_ret_t;
#else
typedef void *ggml_thread_ret_t;
#endif
struct ggml_threading_context;
// Optional (experimental) features.
enum ggml_threading_features {
GGML_THREADING_FEATURE_NONE = 0,
GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0,
GGML_THREADING_FEATURE_PERF = 1 << 1,
};
// Compute errors.
enum ggml_compute_error {
GGML_COMPUTE_OK = 0,
GGML_COMPUTE_FALLBACK = 1,
};
// The task runner to be called by main thread and workers.
typedef enum ggml_compute_error(ggml_threading_task_runner)(
struct ggml_compute_params *params, struct ggml_tensor *node);
// The thread runner to feed into OS threads.
typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
// Init and start underlying workers if n_threads > 1.
//
// features: optional, configures additional threading features.
// see `ggml_threading_features`, default 0.
// stages_time: optional for collecting per-stage wall clock time.
struct ggml_threading_context *
ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread,
ggml_threading_task_runner *task_stage_runner,
enum ggml_threading_features features,
int64_t stages_time[3]);
// Stop workers (if exist), free memories (including the ctx).
void ggml_threading_stop(struct ggml_threading_context *ctx);
// The default implementation of `ggml_threading_thread_runner`
ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data);
// Compute a tensor. It computes the enabled task stages one by one.
// Caller should take care of the return error: retry for fallback error.
enum ggml_compute_error
ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
struct ggml_tensor *node, void *wdata,
size_t wsize);
// This is an experimental functionality for mulmat tune, as a thin wrapper.
enum ggml_compute_error
ggml_compute_forward_wrapper(struct ggml_compute_params *params,
struct ggml_tensor *tensor);
#ifdef __cplusplus
}
#endif
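To illustrate how the pieces declared above fit together, here is a minimal usage
sketch (`my_task_runner`, `node`, `wdata` and `wsize` are placeholders, not code
from this commit): start a pool, compute one tensor, retry once on fallback, then
stop the pool.
```
#include "ggml.h"
#include "ggml-threading.h"

// Placeholder task runner: a real one dispatches on node->op and on the
// stage given by params->type.
static enum ggml_compute_error
my_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) {
    (void)params;
    (void)node;
    return GGML_COMPUTE_OK;
}

// Compute a single tensor with a 4-thread pool, then tear the pool down.
static void compute_one(struct ggml_tensor *node, void *wdata, size_t wsize) {
    int64_t stages_time[3] = {0};
    struct ggml_threading_context *ctx = ggml_threading_start(
        4, ggml_threading_graph_compute_thread, my_task_runner,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);

    enum ggml_compute_error err =
        ggml_threading_compute_tensor(ctx, node, wdata, wsize);
    if (err == GGML_COMPUTE_FALLBACK) {
        // as the header suggests: adjust node->task_profile, then retry
        err = ggml_threading_compute_tensor(ctx, node, wdata, wsize);
    }
    (void)err; // a real caller would check the final status

    ggml_threading_stop(ctx);
}
```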

ggml-tune.c Normal file

@@ -0,0 +1,897 @@
#include <string.h>
#include "ggml-threading.h"
#include "ggml-tune.h"
#include "ggml.h"
// MUL_MAT fine tuning for non-GPU-offloading cases.
#define GGML_MULMAT_CACHE_LEN 16
static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0};
#define FNV_OFFSET 14695981039346656037UL
#define FNV_PRIME 1099511628211UL
static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) {
char buf[30];
snprintf(buf, 30, "%d%d%d", M, N, K);
uint64_t hash = FNV_OFFSET;
for (const char *p = buf; *p; p++) {
hash ^= (uint64_t)(unsigned char)(*p);
hash *= FNV_PRIME;
}
return hash;
}
static const char *
ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
switch (backend) {
case GGML_TASK_BACKEND_NONE:
return "";
case GGML_TASK_BACKEND_CPU:
return "CPU";
case GGML_TASK_BACKEND_CPU_BLAS:
return "BLAS";
case GGML_TASK_BACKEND_GPU:
return "GPU";
case GGML_TASK_BACKEND_GPU_CUDA:
return "CUDA";
case GGML_TASK_BACKEND_GPU_CL:
return "CL";
default:
GGML_ASSERT(false);
}
}
const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t,
enum ggml_type src1_t, int stages_time[3]) {
GGML_ASSERT(tune);
// TODO: default_mm_cache is thread-unsafe.
struct mm_cache_element *mm_cache = default_mm_cache;
int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN;
struct mm_cache_element *e = &mm_cache[slot];
struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
struct ggml_task_profile *prof = NULL;
if (e->M == M && e->N == N && e->K == K) {
prof = e->profile;
if (stages_time) {
for (int i = 0; i < 3; i++) {
stages_time[i] = e->stages_time[i];
}
}
} else {
const struct ggml_mulmat_tune_shape *shape = NULL;
shape = ggml_mulmat_tune_get_shape(tune, N, K, src0_t, src1_t);
if (shape) {
ggml_mulmat_tune_estimate_time(shape, M, profiles_time);
int min = INT32_MAX;
int index = -1;
for (int i = 0; i < shape->n_profiles; i++) {
int total = profiles_time[i].total_time;
if (total < min) {
min = total;
index = i;
}
}
if (index >= 0) {
prof = profiles_time[index].profile;
for (int i = 0; i < 3; i++) {
int t = profiles_time[index].stage_time[i];
if (stages_time) {
stages_time[i] = t;
}
e->stages_time[i] = t;
}
GGML_ASSERT(prof);
e->profile = prof;
e->M = M;
e->N = N;
e->K = K;
// to disable this, build with
// `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1
// make`
#if !defined(GGML_MULMAT_TUNE_NDEBUG)
const char *names[3];
for (int i = 0; i < 3; i++) {
names[i] = ggml_mulmat_tune_task_backend_name(
prof->stages[i].backend);
}
printf(
"\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the "
"fastest profile: %s %s %s\n",
M, N, K, names[0], names[1], names[2]);
#endif
}
}
}
return prof;
}
void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
const char *name, enum ggml_ftype ftype) {
const int n_vocab = 32000;
int n_embd;
// n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult
int n_ff;
// n_rot = n_embd/n_head;
int n_rot;
if (strcmp(name, "3B") == 0) {
// n_head=32, n_mult=216, n_layer=26
// https://github.com/ggerganov/llama.cpp/pull/1588
n_embd = 3200;
n_ff = 8640;
n_rot = 100;
} else if (strcmp(name, "7B") == 0) {
n_embd = 4096;
n_ff = 11008;
n_rot = 128;
} else if (strcmp(name, "13B") == 0) {
n_embd = 5120;
n_ff = 13824;
n_rot = 128;
} else if (strcmp(name, "30B") == 0) {
n_embd = 6656;
n_ff = 17920;
n_rot = 128;
} else if (strcmp(name, "65B") == 0) {
n_embd = 8192;
n_ff = 22016;
n_rot = 128;
} else {
GGML_ASSERT(false);
}
model->name = name;
model->ftype = ftype;
model->n_vocab = n_vocab;
model->n_embd = n_embd;
model->n_ff = n_ff;
model->n_rot = n_rot;
}
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params,
struct ggml_task_profile_factory *pf) {
struct ggml_mulmat_tune_model *model = &params->model;
memset(tune, 0, sizeof(struct ggml_mulmat_tune));
tune->version = GGML_MULMAT_TUNE_VERSION;
tune->n_threads = params->n_threads;
tune->ftype = model->ftype;
size_t name_len = strlen(model->name);
GGML_ASSERT(name_len > 0);
strncpy(tune->model, model->name, sizeof(tune->model) - 1);
const enum ggml_type rot_src0_type = GGML_TYPE_F16;
const enum ggml_type src1_type = GGML_TYPE_F32;
int n_vocab = model->n_vocab;
int n_embd = model->n_embd;
int n_ff = model->n_ff;
int n_rot = model->n_rot;
enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype);
GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6);
tune->n_shapes = GGML_MULMAT_N_SHAPES;
// Attention layers
tune->shapes[0] = (struct ggml_mulmat_tune_shape){
.N = n_embd, .K = n_embd, .src0_type = type, .src1_type = src1_type};
// Feed forward layers
tune->shapes[1] = (struct ggml_mulmat_tune_shape){
.N = n_embd, .K = n_ff, .src0_type = type, .src1_type = src1_type};
tune->shapes[2] = (struct ggml_mulmat_tune_shape){
.N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type};
tune->shapes[3] = (struct ggml_mulmat_tune_shape){
.N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type};
// RoPE
tune->shapes[4] = (struct ggml_mulmat_tune_shape){
.N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type};
tune->shapes[5] = (struct ggml_mulmat_tune_shape){
.N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type};
for (int i = 0; i < tune->n_shapes; i++) {
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
shape->n_profiles = ggml_mulmat_get_task_profiles(
pf, shape->src0_type, shape->src1_type, &shape->profiles);
if (shape->n_profiles == 0) {
// allowed for testing.
continue;
}
shape->m_num = params->m_num;
shape->arr_m = malloc(shape->m_num * sizeof(int));
for (int j = 0; j < shape->m_num; j++) {
shape->arr_m[j] = 1 << j;
}
size_t sz = sizeof(struct ggml_mulmat_tune_m) *
(shape->n_profiles * shape->m_num);
shape->items = malloc(sz);
GGML_ASSERT(shape->items);
memset(shape->items, 0, sz);
}
return true;
}
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
for (int i = 0; i < tune->n_shapes; i++) {
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
GGML_ASSERT(shape);
// arr_m and items can be NULL only when testing.
if (shape->arr_m) {
free(shape->arr_m);
}
if (shape->items) {
free(shape->items);
}
}
}
static bool ggml_mulmat_tune_write_profiles(
FILE *fp, const struct ggml_task_profile *profiles, int n_profiles) {
int rc;
for (int i = 0; i < n_profiles; i++) {
const struct ggml_task_profile *profile = &profiles[i];
for (int j = 0; j < 3; j++) {
const struct ggml_task_stage *ts = &profile->stages[j];
rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
ts->wait ? 1 : 0);
if (rc <= 0) {
return false;
}
if (j < 2) {
rc = fprintf(fp, " ");
if (rc <= 0) {
return false;
}
}
}
rc = fprintf(fp, "\n");
if (rc <= 0) {
return false;
}
}
return true;
}
static bool
ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
const char *model, int ftype, int n_threads,
char *errbuf, int errbuf_len) {
if (tune->version != GGML_MULMAT_TUNE_VERSION) {
snprintf(errbuf, errbuf_len - 1,
"version mismatch, built-in: %d, "
"yours: %d",
GGML_MULMAT_TUNE_VERSION, tune->version);
return false;
} else if (strcmp(model, tune->model) != 0) {
snprintf(errbuf, errbuf_len - 1,
"model mismatch. built-in: %s, yours: %s", model, tune->model);
return false;
} else if (ftype != tune->ftype) {
snprintf(errbuf, errbuf_len - 1,
"ftype mismatch. built-in: %d, yours: %d\n", ftype,
tune->ftype);
return false;
} else if (n_threads != tune->n_threads) {
snprintf(errbuf, errbuf_len - 1,
"n_threads mismatch. run-time: %d, yours: %d\n", n_threads,
tune->n_threads);
return false;
}
for (int i = 0; i < tune->n_shapes; i++) {
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
struct ggml_task_profile *builtin_profiles = NULL;
int n_profiles = ggml_mulmat_get_task_profiles(
NULL, shape->src0_type, shape->src1_type, &builtin_profiles);
if (n_profiles != shape->n_profiles) {
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
return false;
}
// TODO: profiles order is relevant, too strict.
size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
printf("=== built-in profiles:\n");
ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
n_profiles);
printf("=== incoming profiles:\n");
ggml_mulmat_tune_write_profiles(stderr, shape->profiles,
shape->n_profiles);
return false;
}
}
return true;
}
bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
const char *model, int ftype, int n_threads) {
char errbuf[128];
bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads,
errbuf, sizeof(errbuf));
if (!ok) {
fprintf(stderr, "[mulmat tune] error: %s. run bench again.\n", errbuf);
}
return ok;
}
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
int rc = fscanf(fp, "%d", &tune->version);
if (rc <= 0) {
return false;
}
if (tune->version != GGML_MULMAT_TUNE_VERSION) {
fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n");
return false;
}
rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype,
&tune->n_shapes, &tune->n_threads);
if (rc <= 0) {
return false;
}
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
rc = fscanf(fp, "%d %d %d %d %d %d", &shape->N, &shape->K,
(int *)&shape->src0_type, (int *)&shape->src1_type,
&shape->n_profiles, &shape->m_num);
if (rc <= 0) {
return false;
}
{
size_t item_size = sizeof(struct ggml_mulmat_tune_m) *
(shape->n_profiles * shape->m_num);
shape->items = malloc(item_size);
if (shape->items == NULL) {
fprintf(stderr, "[mulmat tune] failed to allocate memory\n");
return false;
}
memset(shape->items, 0, item_size);
}
{
size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles;
shape->profiles = malloc(sz);
GGML_ASSERT(shape->profiles);
memset(shape->profiles, 0, sz);
}
for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip];
for (int j = 0; j < 3; j++) {
struct ggml_task_stage *ts = &profile->stages[j];
int backend;
int parallel;
int wait;
rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait);
if (rc <= 0) {
return false;
}
ts->backend = (enum ggml_task_backend)backend;
ts->parallel = parallel ? true : false;
ts->wait = wait ? true : false;
}
}
for (int i_m = 0; i_m < shape->m_num; i_m++) {
int M;
for (int ip = 0; ip < shape->n_profiles; ip++) {
if (ip == 0) {
rc = fscanf(fp, "%d", &M);
if (rc <= 0) {
return false;
}
}
struct ggml_mulmat_tune_m *item =
&shape->items[ip * shape->m_num + i_m];
item->M = M;
rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
&item->stages_time[1], &item->stages_time[2]);
if (rc <= 0) {
return false;
}
}
}
}
return true;
}
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
FILE *fp) {
int rc;
rc = fprintf(fp, "%d %s %d %d %d\n\n", tune->version, tune->model,
tune->ftype, tune->n_shapes, tune->n_threads);
if (rc <= 0) {
return false;
}
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
if (i_shape > 0) {
fprintf(fp, "\n");
}
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
rc = fprintf(fp, "%d %d %d %d %d %d\n", shape->N, shape->K,
shape->src0_type, shape->src1_type, shape->n_profiles,
shape->m_num);
if (rc <= 0) {
return false;
}
rc = ggml_mulmat_tune_write_profiles(fp, shape->profiles,
shape->n_profiles);
if (rc <= 0) {
return false;
}
for (int i_m = 0; i_m < shape->m_num; i_m++) {
for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_mulmat_tune_m *item =
&shape->items[ip * shape->m_num + i_m];
if (ip == 0) {
rc = fprintf(fp, "%4d", item->M);
if (rc <= 0) {
return false;
}
}
struct ggml_task_profile *profile = &shape->profiles[ip];
for (int k = 0; k < 3; k++) {
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
rc = fprintf(fp, "%9d", item->stages_time[k]);
if (rc <= 0) {
return false;
}
} else {
rc = fprintf(fp, " 0");
if (rc <= 0) {
return false;
}
}
}
}
rc = fprintf(fp, "\n");
if (rc <= 0) {
return false;
}
}
}
return true;
}
const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
const int K, enum ggml_type src0_type,
enum ggml_type src1_type) {
GGML_ASSERT(N > 0 && K > 0);
for (int i = 0; i < tune->n_shapes; i++) {
const struct ggml_mulmat_tune_shape *s = &tune->shapes[i];
if (s->src0_type != src0_type || s->src1_type != src1_type) {
continue;
}
if (s->N > 0 && s->K > 0) {
if (s->N == N && s->K == K) {
return s;
}
} else if (s->N > 0 && s->K == 0) {
if (s->N == N) {
return s;
}
} else if (s->N == 0 && s->K > 0) {
if (s->K == K) {
return s;
}
}
}
return NULL;
}
// This is the experimental reference implementation.
// Requires that n_threads is the same at bench time and at runtime.
void ggml_mulmat_tune_estimate_time(
const struct ggml_mulmat_tune_shape *shape, int M,
struct ggml_mulmat_tune_time *profile_time) {
GGML_ASSERT(shape);
GGML_ASSERT(profile_time);
const int m_num = shape->m_num;
const int min_m = shape->items[0].M;
const int max_m = shape->items[m_num - 1].M;
for (int ip = 0; ip < shape->n_profiles; ip++) {
struct ggml_task_profile *profile = &shape->profiles[ip];
profile_time[ip].total_time = 0;
profile_time[ip].profile = profile;
const int items_offset = ip * m_num;
struct ggml_mulmat_tune_m *p0 = NULL;
struct ggml_mulmat_tune_m *p1 = NULL;
if (M < min_m) {
// first two.
p0 = &shape->items[items_offset];
p1 = &shape->items[items_offset + 1];
} else if (M > max_m) {
// last two
p0 = &shape->items[items_offset + m_num - 2];
p1 = &shape->items[items_offset + m_num - 1];
} else {
for (int i = 0; i < m_num; i++) {
p1 = &shape->items[items_offset + i];
if (p1->M == M) {
p0 = p1;
break;
}
if (i > 0) {
p0 = (struct ggml_mulmat_tune_m *)(p1 - 1);
if (M > p0->M && M < p1->M) {
break;
}
}
}
}
GGML_ASSERT(p0 && p1);
for (int i_stage = 0; i_stage < 3; i_stage++) {
struct ggml_task_stage *stage = &profile->stages[i_stage];
if (stage->backend == GGML_TASK_BACKEND_NONE) {
continue;
}
int p0_v = p0->stages_time[i_stage];
int p1_v = p1->stages_time[i_stage];
GGML_ASSERT(p0_v >= 0);
GGML_ASSERT(p1_v >= 0);
// t = aM + b
double a;
double b;
if (p0 == p1) {
a = 0.0;
b = p1_v;
} else {
a = 1.0 * (p1_v - p0_v) / (p1->M - p0->M);
b = p1_v - a * p1->M;
}
int t = (int)(a * M + b);
profile_time[ip].stage_time[i_stage] = t;
profile_time[ip].total_time += t;
}
}
}
// Experimental: create mul_mat tensor.
static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K,
enum ggml_type src0_type,
struct ggml_context **ctx) {
// At most 256, because in `ggml_quantize_qx_x`, the index type of hist is
// either int8_t or uint8_t.
// Use 1024 to avoid breaking unexpectedly.
int64_t hist[1024];
bool src0_is_quantized = ggml_is_quantized(src0_type);
size_t ctx_size = 0;
ctx_size += (size_t)(M * N * ggml_type_sizef(GGML_TYPE_F32)); // src1
ctx_size += (size_t)(N * K * ggml_type_sizef(src0_type)); // src0
ctx_size += (size_t)(1024 * 1024 * 64); // experimental
if (src0_is_quantized) {
// quantize F32 to Qx_x
ctx_size += (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32));
}
struct ggml_init_params init_params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = 0,
};
*ctx = ggml_init(init_params);
GGML_ASSERT(*ctx);
// src0: N x K
struct ggml_tensor *src0 =
ggml_new_tensor_2d(*ctx, src0_type, (int64_t)K, (int64_t)N);
// src1: M x K
struct ggml_tensor *src1 =
ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)M);
ggml_set_f32(src1, 0.5f);
if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) {
ggml_set_f32(src0, 0.1f);
} else if (src0_is_quantized) {
struct ggml_tensor *src0_f32 =
ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N);
ggml_set_f32(src0_f32, 0.1f);
switch (src0_type) {
case GGML_TYPE_Q4_0:
ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
case GGML_TYPE_Q4_1:
ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
case GGML_TYPE_Q5_0:
ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
case GGML_TYPE_Q5_1:
ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
case GGML_TYPE_Q8_0:
ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K,
K, hist);
break;
default:
GGML_ASSERT(false);
}
} else {
GGML_ASSERT(false);
}
// node: M x N
// Will compute z = y * xT, z: node, y: src1, x: src0
return ggml_mul_mat(*ctx, src0, src1);
}
// Experimental: allocate memory for wdata with max possible size.
// This part of the code actually belongs to the ggml compute graph.
static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
// The size is actually determined by cgraph before computing.
// Apart from the src0_type, wsize is affected by backend, cache line size,
// n_threads etc.
const size_t extra = 1024 * 1024;
size_t sz = (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)) + extra;
void *buf = malloc(sz);
if (!buf) {
fprintf(stderr,
"[mulmat tune] error: failed to allocate %zu MiB memory\n",
sz / 1024 / 1024);
return 0;
}
memset(buf, 0, sz);
*wdata = buf;
return sz;
}
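// Illustrative sketch (not part of this commit): the intended caller pattern
// for the helper above -- allocate per (N, K), hand the buffer to the compute
// call, then free it. `compute_with_wdata` is a placeholder standing in for
// ggml_threading_compute_tensor() in the bench loop below.
static bool ggml_mulmat_tune_example_with_wdata(
    int N, int K, bool (*compute_with_wdata)(char *wdata, size_t wsize)) {
    char *wdata = NULL;
    size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata);
    if (wsize == 0) {
        return false; // allocation failed, error already printed
    }
    bool ok = compute_with_wdata(wdata, wsize);
    free(wdata);
    return ok;
}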
int ggml_mulmat_tune_get_builtin_task_backends(
enum ggml_task_backend *backends) {
int i = 0;
backends[i++] = GGML_TASK_BACKEND_CPU;
if (ggml_cpu_has_cpublas()) {
backends[i++] = GGML_TASK_BACKEND_CPU_BLAS;
}
if (ggml_cpu_has_cublas()) {
backends[i++] = GGML_TASK_BACKEND_GPU_CUDA;
} else if (ggml_cpu_has_clblast()) {
backends[i++] = GGML_TASK_BACKEND_GPU_CL;
}
return i;
}
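// Illustrative sketch (not part of this commit): list the task backends this
// binary was built with, using the same name helper as the bench below.
static void ggml_mulmat_tune_example_print_backends(void) {
    enum ggml_task_backend backends[16];
    int n = ggml_mulmat_tune_get_builtin_task_backends(backends);
    for (int i = 0; i < n; i++) {
        fprintf(stdout, "%s%s", i > 0 ? ", " : "",
                ggml_mulmat_tune_task_backend_name(backends[i]));
    }
    fprintf(stdout, "\n");
}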
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params) {
GGML_ASSERT(tune);
GGML_ASSERT(params);
GGML_ASSERT(params->model.name);
enum ggml_task_backend backends[16];
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
if (n_backends < 2) {
fprintf(stderr,
"[mulmat tune] error: this program was not built with a BLAS or GPU backend.\n");
return false;
}
bool ok = ggml_mulmat_tune_init(tune, params, NULL);
if (!ok) {
return false;
}
{
char buf[128] = {0};
int offset = 0;
for (int i = 0; i < n_backends; i++) {
if (i > 0) {
buf[offset++] = ',';
buf[offset++] = ' ';
}
const char *name = ggml_mulmat_tune_task_backend_name(backends[i]);
size_t len = strlen(name);
memcpy(&buf[offset], name, len);
offset += (int)len;
}
fprintf(stdout,
"[mulmat tune] model: %s, ggml ftype: %d, "
"n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n",
params->model.name, params->model.ftype, params->n_pass,
params->n_threads, tune->n_shapes, buf);
}
int64_t stages_time[3];
int64_t t0 = ggml_time_ms();
struct ggml_threading_context *thrd_ctx = ggml_threading_start(
tune->n_threads, ggml_threading_graph_compute_thread,
ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE,
stages_time);
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
int M;
int N = shape->N;
int K = shape->K;
char buf[20] = {0};
int buf_len = sizeof(buf) - 1;
int line_len = 0;
for (int i_m = 0; i_m < shape->m_num; i_m++) {
M = shape->arr_m[i_m];
if (shape->N == 0) {
N = M;
} else if (shape->K == 0) {
K = M;
}
if (params->progress) {
line_len = snprintf(buf, buf_len, "%d %d %d ", N, K, M);
fprintf(stdout, "%s", buf);
fflush(stdout);
}
char *wdata = NULL;
size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata);
if (wsize == 0) {
return false;
}
struct ggml_context *ctx = NULL;
struct ggml_tensor *node =
ggml_mulmat_new_tensor(M, N, K, shape->src0_type, &ctx);
for (int ip = 0; ip < shape->n_profiles; ip++) {
const struct ggml_task_profile *profile = &shape->profiles[ip];
memcpy(&node->task_profile, profile,
sizeof(struct ggml_task_profile));
struct ggml_mulmat_tune_m *item =
&shape->items[ip * shape->m_num + i_m];
item->M = M;
int min[3] = {INT32_MAX, INT32_MAX, INT32_MAX};
for (int k = 0; k < params->n_pass; k++) {
for (int j = 0; j < 3; j++) {
stages_time[j] = 0;
}
/*enum ggml_compute_error err = */
ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
for (int i = 0; i < 3; i++) {
int v = (int)stages_time[i];
if (v < min[i]) {
min[i] = v;
}
}
if (params->progress) {
fprintf(stdout, ".");
fflush(stdout);
line_len++;
}
}
for (int i = 0; i < 3; i++) {
item->stages_time[i] = min[i];
}
}
ggml_free(ctx);
free(wdata);
if (params->progress) {
line_len += 10;
for (int j = 0; j < line_len; j++) {
fprintf(stdout, "\b \b");
}
fflush(stdout);
}
}
}
ggml_threading_stop(thrd_ctx);
fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n",
(int)(ggml_time_ms() - t0) / 1000);
// output
if (params->fname && strcmp(params->fname, "") != 0) {
FILE *fp = fopen(params->fname, "w");
if (!fp) {
fprintf(stderr,
"[mulmat tune] warn: failed to open file `%s`, print to "
"console instead\n\n",
params->fname);
params->output_console = 1;
} else {
ok = ggml_mulmat_tune_write_data(tune, fp);
fclose(fp);
if (ok) {
fprintf(stdout, "[mulmat tune] data was written to `%s`\n",
params->fname);
} else {
fprintf(
stderr,
"[mulmat tune] warn: failed to write file `%s`, print to "
"console instead\n\n",
params->fname);
params->output_console = 1;
}
}
}
if (params->output_console) {
return ggml_mulmat_tune_write_data(tune, stdout);
}
return true;
}
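// Illustrative sketch (not part of this commit): a self-contained bench call
// with made-up "7B"-style hyperparameters; real callers fill these from the
// loaded model (see llama_mulmat_tune() in llama.cpp).
static bool ggml_mulmat_tune_example_bench(void) {
    struct ggml_mulmat_tune tune;
    struct ggml_mulmat_tune_params params = {
        .model = {.name = "7B", .ftype = GGML_FTYPE_MOSTLY_Q4_0,
                  .n_vocab = 32000, .n_embd = 4096, .n_ff = 11008, .n_rot = 128},
        .m_num = 8,
        .n_pass = 1,
        .n_threads = 4,
        .progress = true,
        .output_console = false,
        .fname = "mulmat-tune.txt",
    };
    bool ok = ggml_mulmat_tune_bench(&tune, &params);
    ggml_mulmat_tune_free(&tune);
    return ok;
}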

137
ggml-tune.h Normal file
View File

@ -0,0 +1,137 @@
#pragma once
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_MULMAT_TUNE_VERSION 8
#define GGML_MULMAT_N_SHAPES 6
#define GGML_MULMAT_MAX_PASS 3
struct ggml_mulmat_tune_m {
int M;
int stages_time[3];
};
struct ggml_mulmat_tune_model {
const char *name;
enum ggml_ftype ftype;
int n_vocab;
int n_embd;
// n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult
int n_ff;
// n_rot = n_embd/n_head;
int n_rot;
};
struct ggml_mulmat_tune_shape {
// For RoPE, one of N / K is 0.
int N;
int K;
enum ggml_type src0_type;
enum ggml_type src1_type;
int n_profiles;
struct ggml_task_profile *profiles;
int m_num;
int *arr_m;
struct ggml_mulmat_tune_m *items;
};
struct ggml_mulmat_tune {
int version;
char model[16];
enum ggml_ftype ftype;
int n_shapes;
// Given N/K, we bench for mul_mat [M,K] x [K,N].
struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES];
int n_threads;
};
struct ggml_mulmat_tune_time {
struct ggml_task_profile *profile;
int stage_time[3];
int total_time;
};
struct mm_cache_element {
int M;
int N;
int K;
struct ggml_task_profile *profile;
int stages_time[3];
};
// params for tune/bench.
struct ggml_mulmat_tune_params {
struct ggml_mulmat_tune_model model;
int m_num;
int n_pass;
int n_threads;
bool progress; // print and clear '.'
bool output_console; // also print result to console
const char *fname;
};
// NOTE: stages_time is filled if not null.
const struct ggml_task_profile *
ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
int N, int K, enum ggml_type src0_t,
enum ggml_type src1_t, int stages_time[3]);
bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
const char *model_name, int ftype,
int n_threads);
void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
const char *name, enum ggml_ftype ftype);
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params,
struct ggml_task_profile_factory *profile_factory);
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);
const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K,
enum ggml_type src0_type, enum ggml_type src1_type);
void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape,
int M,
struct ggml_mulmat_tune_time *profile_time);
const char *ggml_task_backend_name(enum ggml_task_backend backend);
int ggml_mulmat_tune_get_builtin_task_backends(
enum ggml_task_backend *backends);
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
struct ggml_mulmat_tune_params *params);
#ifdef __cplusplus
}
#endif
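// Illustrative usage sketch (not part of this header), assuming a data file
// previously produced by ggml_mulmat_tune_bench(); the file name, model name
// and shape numbers are made up:
//
//   struct ggml_mulmat_tune tune;
//   FILE *fp = fopen("mulmat-tune.txt", "r");
//   if (fp) {
//       bool ok = ggml_mulmat_tune_read_data(&tune, fp);
//       fclose(fp);
//       if (ok && ggml_mulmat_tune_validate(&tune, "7B", tune.ftype, /*n_threads*/ 4)) {
//           int stages_time[3];
//           const struct ggml_task_profile *profile =
//               ggml_mulmat_tune_select_task_profile(
//                   &tune, /*M*/ 32, /*N*/ 4096, /*K*/ 4096,
//                   GGML_TYPE_Q4_0, GGML_TYPE_F32, stages_time);
//           (void)profile; // how to run INIT/COMPUTE/FINALIZE for this shape
//       }
//       if (ok) {
//           ggml_mulmat_tune_free(&tune);
//       }
//   }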

1032
ggml.c

File diff suppressed because it is too large

71
ggml.h
View File

@ -1,5 +1,7 @@
#pragma once
//
// GGML Tensor Library
//
@ -200,6 +202,7 @@
#define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32
#define GGML_DEFAULT_N_THREADS 4
#define GGML_MAX_TASK_PROFILES 8
#define GGML_ASSERT(x) \
do { \
@ -347,7 +350,6 @@ extern "C" {
GGML_OP_COUNT,
};
// ggml object
struct ggml_object {
size_t offs;
@ -360,6 +362,54 @@ extern "C" {
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// As part of task config profile solution, `ggml_task_backend` defines
// backends for each task stage. Similar to `ggml_tensor.backend`,
// `ggml_tensor.task_profile` generalizes how to configure tensor computing
// at per task-stage level.
//
// The following enum values are designed as combination of hardware and
// optional software interface.
enum ggml_task_backend {
GGML_TASK_BACKEND_NONE = 0,
// [0x10, 0x1F]: CPU
GGML_TASK_BACKEND_CPU = 0x10,
GGML_TASK_BACKEND_CPU_BLAS = 0x11,
// [0x20 - 0x2F]: GPU
GGML_TASK_BACKEND_GPU = 0x20,
GGML_TASK_BACKEND_GPU_CUDA = 0x21,
GGML_TASK_BACKEND_GPU_CL = 0x22,
};
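// Because the values are grouped by range, the hardware class of a backend
// can be tested with a mask (illustrative, not an API guarantee):
//   bool is_cpu = (backend & 0xF0) == GGML_TASK_BACKEND_CPU;
//   bool is_gpu = (backend & 0xF0) == GGML_TASK_BACKEND_GPU;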
// config for computing one of the 3 task stages of a tensor.
struct ggml_task_stage {
enum ggml_task_backend backend;
bool parallel;
// hint idle workers go waiting, valid only when parallel is false.
bool wait;
};
// config for computing a tensor.
struct ggml_task_profile {
// index 0: INIT, 1: COMPUTE, 2: FINALIZE
struct ggml_task_stage stages[3];
// MUST be used only in test code.
uint8_t dev_flags[4];
};
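// Illustrative sketch (not part of this header): a profile whose INIT stage
// runs single-threaded on the CPU and whose COMPUTE stage runs via BLAS, with
// idle workers asked to wait instead of spinning:
//
//   struct ggml_task_profile p = {0};
//   p.stages[0].backend = GGML_TASK_BACKEND_CPU;      // INIT
//   p.stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; // COMPUTE
//   p.stages[1].wait    = true;                       // valid: parallel is false
//   p.stages[2].backend = GGML_TASK_BACKEND_NONE;     // no FINALIZE
//   node->task_profile  = p;                          // node: struct ggml_tensor *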
struct ggml_task_profile_factory {
struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES];
int n_f32_f32;
struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES];
int n_f16_f32;
struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES];
int n_qxx_f32;
};
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@ -383,7 +433,8 @@ extern "C" {
struct ggml_tensor * opt[GGML_MAX_OPT];
// thread scheduling
int n_tasks;
struct ggml_task_profile task_profile;
// performance
int perf_runs;
@ -396,7 +447,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[4]; char padding[12];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -407,6 +458,8 @@ extern "C" {
int n_leafs;
int n_threads;
struct ggml_mulmat_tune *tune;
size_t work_size;
struct ggml_tensor * work;
@ -1287,9 +1340,21 @@ extern "C" {
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_cpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_vsx (void);
//
// mulmat task profiles
//
GGML_API void ggml_mulmat_init_task_profiles(void);
GGML_API int ggml_mulmat_get_task_profiles(
struct ggml_task_profile_factory *pf,
enum ggml_type src0_t,
enum ggml_type src1_t,
struct ggml_task_profile **profiles);
//
// Internal types and functions exposed for tests and benchmarks
//

160
llama.cpp
View File

@ -4,6 +4,7 @@
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#endif
#include "llama-util.h"
@ -20,6 +21,10 @@
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_MULMAT_TUNE
#include "ggml-tune.h"
#endif
#include <array>
#include <ctime>
#include <cinttypes>
@ -280,6 +285,10 @@ struct llama_context {
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
#ifdef GGML_USE_MULMAT_TUNE
struct ggml_mulmat_tune *tune = nullptr;
#endif
void use_buf(struct ggml_context * ctx, int i) {
#if defined(LLAMA_USE_SCRATCH)
size_t last_size = 0;
@ -1396,10 +1405,12 @@ static bool llama_eval_internal(
struct ggml_context * ctx0 = ggml_init(params);
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; gf.n_threads = n_threads;
#ifdef GGML_USE_MULMAT_TUNE
gf.tune = lctx.tune;
#endif
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
@ -2732,7 +2743,150 @@ struct llama_context * llama_init_from_file(
return ctx;
}
#ifdef GGML_USE_MULMAT_TUNE
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
printf("\n");
if (ctx->model.n_gpu_layers != 0) {
fprintf(stderr, "[mulmat tune] error: tuning is disabled when GPU offloading is enabled\n");
return false;
}
const char *model_name = llama_model_type_name(ctx->model.type);
llama_hparams *hparams = &ctx->model.hparams;
enum ggml_ftype ggml_ftype;
switch (hparams->ftype) {
case LLAMA_FTYPE_ALL_F32:
ggml_ftype = GGML_FTYPE_ALL_F32;
break;
case LLAMA_FTYPE_MOSTLY_F16:
ggml_ftype = GGML_FTYPE_MOSTLY_F16;
break;
case LLAMA_FTYPE_MOSTLY_Q4_0:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_0;
break;
case LLAMA_FTYPE_MOSTLY_Q4_1:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
break;
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
break;
case LLAMA_FTYPE_MOSTLY_Q5_0:
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
break;
case LLAMA_FTYPE_MOSTLY_Q5_1:
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_1;
break;
case LLAMA_FTYPE_MOSTLY_Q8_0:
ggml_ftype = GGML_FTYPE_MOSTLY_Q8_0;
break;
case LLAMA_FTYPE_MOSTLY_Q2_K:
ggml_ftype = GGML_FTYPE_MOSTLY_Q2_K;
break;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
ggml_ftype = GGML_FTYPE_MOSTLY_Q3_K;
break;
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_K;
break;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_K;
break;
case LLAMA_FTYPE_MOSTLY_Q6_K:
ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
break;
default:
throw std::runtime_error(
format("invalid output file type %d\n", hparams->ftype));
}
int n_vocab = hparams->n_vocab;
int n_embd = hparams->n_embd;
int n_rot = hparams->n_rot;
int n_mult = hparams->n_mult;
int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
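// e.g. with 7B-style hparams (n_embd = 4096, n_mult = 256):
// 2*(4*4096)/3 = 10922, ((10922 + 255)/256)*256 = 43*256 = n_ff = 11008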
struct ggml_mulmat_tune_params params = {
/*.model =*/ {
/* .name =*/ model_name,
/* .ftype =*/ ggml_ftype,
/* .n_vocab =*/ n_vocab,
/* .n_embd =*/ n_embd,
/* .n_ff =*/ n_ff,
/* .n_rot =*/ n_rot,
},
/* .m_num =*/ 8,
/* .n_pass =*/ 1,
/* .n_threads =*/ n_threads,
/* .progress =*/ true,
/* .output_console =*/ false,
/* .fname =*/ fname,
};
bool empty_fname = !fname || strcmp(fname, "") == 0;
ctx->tune = new(struct ggml_mulmat_tune);
if (!ctx->tune) {
throw std::runtime_error(format("failed to allocate memory for tune\n"));
}
if (tune) {
bool ok = ggml_mulmat_tune_bench(ctx->tune, &params);
if (!ok) {
ggml_mulmat_tune_free(ctx->tune);
return false;
}
if (!empty_fname) {
ggml_mulmat_tune_free(ctx->tune);
return true;
}
} else {
if (empty_fname) {
return false;
}
}
if (!empty_fname) {
FILE *fp = fopen(fname, "r");
if (!fp) {
fprintf(stderr, "[mulmat tune] failed to open file %s.\n",
fname);
} else {
bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
fclose(fp);
if (!ok) {
fprintf(stderr,
"[mulmat tune] failed to read data from %s\n",
fname);
return false;
}
fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname);
ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads);
if (!ok) {
return false;
}
}
}
return true;
}
#endif
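// Illustrative sketch (not part of this file): how an application might drive
// the tuner. With tune == true the bench runs and (when fname is given) the
// result is written to that file; with tune == false a previously saved file
// is loaded instead. The file name and error handling here are hypothetical.
//
// #ifdef GGML_USE_MULMAT_TUNE
//     if (!llama_mulmat_tune(ctx, /*n_threads*/ 4, /*tune*/ true, "mulmat-tune.txt")) {
//         fprintf(stderr, "mulmat tune failed\n");
//     }
// #endif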
void llama_free(struct llama_context * ctx) {
#ifdef GGML_USE_MULMAT_TUNE
if (ctx->tune) {
delete(ctx->tune);
}
#endif
delete ctx;
}

llama.h
View File

@ -300,6 +300,9 @@ extern "C" {
// Print system information
LLAMA_API const char * llama_print_system_info(void);
// Experimental utility for mulmat tuning.
LLAMA_API bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname);
#ifdef __cplusplus
}
#endif

2
tests/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/test-ggml-threading
/test-ggml-tune

tests/CMakeLists.txt
View File

@ -12,3 +12,5 @@ llama_add_test(test-sampling.cpp)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
# llama_add_test(test-grad0.c) # SLOW
# llama_add_test(test-opt.c) # SLOW
llama_add_test(test-ggml-threading.c)
llama_add_test(test-ggml-tune.c)

345
tests/test-ggml-threading.c Normal file
View File

@ -0,0 +1,345 @@
#include "ggml-threading.h"
#include "ggml.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Purposes:
// 1. general overview of the threading behaviors.
// 2. race / deadlock detection.
// # build
// cd build
//
// # build release:
// cmake .. && cmake --build . --config Release
//
// # build with sanitize:
// cmake .. -DLLAMA_SANITIZE_THREAD=ON && cmake --build . --config Release
//
// # run:
// ./bin/test-ggml-threading
// How to turn off the warning on Apple: malloc: nano zone abandoned due to
// inability to reserve vm space?
// ==> export MallocNanoZone=0, no need to rebuild.
// See `nano_init()` from
// https://opensource.apple.com/source/libmalloc/libmalloc-140.40.1/src/nano_malloc.c.auto.html
// How to view the threading debug:
// ==> uncomment `#define GGML_THREADING_DEBUG 1` from file ggml-threading.c
#define UNUSED(x) (void)(x)
#define MAX_N_THREADS 16
static const int n_repeat = 10;
// C11 atomics are awkward to use on Windows, so replace the atomic counter
// with a per-thread array.
static int work_done_arr[MAX_N_THREADS];
static enum ggml_compute_error
mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) {
int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
if (node->task_profile.stages[params->type].parallel) {
loops /= params->nth;
}
volatile int64_t j = 0;
for (int i = 0; i < loops; i++) {
j++;
}
UNUSED(j);
work_done_arr[params->ith]++;
return GGML_COMPUTE_OK;
}
int test_driver(int id, struct ggml_tensor *node, int n_threads) {
printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads);
for (int i = 0; i < n_threads; i++) {
work_done_arr[i] = 0;
}
bool wait_on_done = (node->task_profile.dev_flags[0] > 0u);
enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF;
if (wait_on_done) {
features |= GGML_THREADING_FEATURE_WAIT_ON_DONE;
}
int t0 = (int)ggml_time_us();
struct ggml_threading_context *ctx =
ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
mock_task_runner, features, /*stages_time*/ NULL);
int t1 = (int)ggml_time_us();
for (int i = 0; i < n_repeat; i++) {
enum ggml_compute_error err = ggml_threading_compute_tensor(
ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
if (err != GGML_COMPUTE_OK) {
ggml_threading_stop(ctx);
fprintf(stderr,
"ggml_threading_compute_tensor failed with error: %d.\n",
err);
return 1;
}
}
int t2 = (int)ggml_time_us();
ggml_threading_stop(ctx);
int t3 = (int)ggml_time_us();
int expect = 0;
for (int i = 0; i < 3; i++) {
struct ggml_task_stage *ts = &node->task_profile.stages[i];
if (ts->backend != GGML_TASK_BACKEND_NONE) {
if (ts->parallel) {
expect += n_threads;
} else {
expect++;
}
}
}
expect *= n_repeat;
int actual = 0;
for (int i = 0; i < n_threads; i++) {
actual += work_done_arr[i];
}
uint8_t loops = node->task_profile.dev_flags[1];
printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: "
"(parallel: %d, "
"wait: %d)\n"
"\tstage-1: (parallel: %d, wait: %d)\n",
loops, wait_on_done, node->task_profile.stages[0].parallel,
node->task_profile.stages[0].wait,
node->task_profile.stages[1].parallel,
node->task_profile.stages[1].wait);
if (actual == expect) {
printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f "
"ms, total %6.3f ms\n",
1.0 * (t1 - t0) / 1000, 1.0 * (t2 - t1) / 1000,
1.0 * (t3 - t2) / 1000, 1.0 * (t3 - t0) / 1000);
return 0;
}
fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect,
actual);
return 2;
}
static enum ggml_compute_error
mock_task_runner_fallback(struct ggml_compute_params *params,
struct ggml_tensor *node) {
UNUSED(params);
if (node->backend == GGML_BACKEND_GPU) {
// ... finally failed to compute in GPU.
node->backend = GGML_BACKEND_CPU;
return GGML_COMPUTE_FALLBACK;
} else {
return GGML_COMPUTE_OK;
}
}
// By design, fallback happens when attempting to compute the tensor on the
// GPU, so it is not parallelized.
int test_fallback(struct ggml_tensor *node) {
struct ggml_threading_context *ctx = ggml_threading_start(
1, ggml_threading_graph_compute_thread, mock_task_runner_fallback,
/*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL);
enum ggml_compute_error err =
ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
if (err == GGML_COMPUTE_FALLBACK) {
err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
/*wsize*/ 0);
}
ggml_threading_stop(ctx);
if (err != GGML_COMPUTE_OK) {
fprintf(stderr,
"ggml_threading_compute_tensor failed with error: %d.\n", err);
return 1;
}
return 0;
}
int main(void) {
ggml_time_init();
struct ggml_tensor node;
memset(&node, 0, sizeof(struct ggml_tensor));
struct ggml_task_stage *stages = node.task_profile.stages;
stages[0].backend = GGML_TASK_BACKEND_CPU;
stages[1].backend = GGML_TASK_BACKEND_CPU;
stages[2].backend = GGML_TASK_BACKEND_NONE;
int n_passed = 0;
int n_tests = 0;
int parallel[3] = {0, 1, 2};
// In github build actions (windows-latest-cmake and ubuntu-latest-cmake):
// When n_threads >= 4, the thread init time and compute time suddenly become
// 100x ~ 1000x slower than with n_threads == 2.
//
// But the tests (n_threads 1, 2, 4, 6) look sound on my devices:
// - MacBook air 2013, ubuntu 22.04
// - MacBook pro 2018, macOS 13.4
//
// So I assume the github build host has limited multi-cpu quota.
// Will skip computing when threading init time is too slow.
//
// NOTE: it's observed that when workload is 0 and n_threads >= number of
// physical cores:
// - the wait/wakeup time varies a lot: it can be tens or hundreds of times
//   the average, which greatly punishes small workloads.
// - wait_on_done is generally faster than wait_now, sometimes by 10x.
int threads_arr[] = {1, 2, 4, 8};
int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
// millions of loops.
uint8_t workload_arr[] = {0u, 1u, 10u};
int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]);
// node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops.
for (int x = 0; x < workload_arr_len; x++) {
node.task_profile.dev_flags[1] = workload_arr[x];
for (int i = 0; i < threads_arr_len; i++) {
int n_threads = threads_arr[i];
if (n_threads > MAX_N_THREADS) {
abort();
}
printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, "
"loops: %2d million(s) ====\n",
n_repeat, n_threads, workload_arr[x]);
if (n_threads > 1) { // skip this n_threads when too slow.
int t0 = (int)ggml_time_us();
struct ggml_threading_context *ctx = ggml_threading_start(
n_threads, ggml_threading_graph_compute_thread,
mock_task_runner, 0, /*stages_time*/ NULL);
int t1 = (int)ggml_time_us();
ggml_threading_stop(ctx);
int elapsed_us = t1 - t0;
if (elapsed_us > 500 * n_threads) {
fprintf(stderr,
"[test-ggml-threading] warning: it took %.3f "
"ms to start %d worker thread(s).\n",
1.0 * elapsed_us / 1000, n_threads - 1);
fprintf(stderr, "[test-ggml-threading] warning: looks like "
"the environment is too slow to run this "
"number of threads, skip.\n");
continue;
}
}
// multi-threads: parallel + wait_now/wait_on_done
if (n_threads == 1) {
stages[0].parallel = false;
stages[1].parallel = false;
stages[0].wait = false;
stages[1].wait = false;
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
continue;
}
for (int j = 0; j < 3; j++) {
stages[0].wait = false;
stages[1].wait = false;
node.task_profile.dev_flags[0] = 0u;
if (parallel[j] == 0) {
stages[0].parallel = false;
stages[1].parallel = false;
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
} else if (parallel[j] == 1) {
stages[0].parallel = true;
stages[1].parallel = false;
for (int k = 0; k < 2; k++) {
stages[1].wait = (k == 1);
if (!stages[1].wait) {
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
continue;
}
// wait
for (int m = 0; m < 2; m++) {
if (m == 1) {
node.task_profile.dev_flags[0] = 1u;
}
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
node.task_profile.dev_flags[0] = 0u;
}
}
} else {
stages[0].parallel = true;
stages[1].parallel = true;
n_tests++;
if (test_driver(n_tests, &node, n_threads) == 0) {
n_passed++;
}
}
}
}
}
{
++n_tests;
node.backend = GGML_BACKEND_GPU;
if (test_fallback(&node) == 0) {
++n_passed;
printf("\n[test-ggml-threading] test fallback: ok\n\n");
}
}
printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests);
return (n_passed == n_tests) ? 0 : 1;
}

200
tests/test-ggml-tune.c Normal file
View File

@ -0,0 +1,200 @@
#include "ggml-tune.h"
#include "ggml.h"
#include <string.h>
static int bench(void);
static int estimate_time_non_zero_NK(void);
static void init_params(struct ggml_mulmat_tune_params *params, int m_num) {
*params = (struct ggml_mulmat_tune_params){
.model =
(struct ggml_mulmat_tune_model){
.name = "3B", // fake
.ftype = GGML_FTYPE_MOSTLY_Q4_0,
.n_vocab = 4096,
.n_embd = 1024,
.n_ff = 2048,
.n_rot = 128,
},
.m_num = m_num,
.n_pass = 1,
.n_threads = 1,
.progress = false,
.output_console = true,
.fname = NULL};
}
int main(void) {
int rv = bench();
if (rv != 0) {
return rv;
}
printf("\n");
rv = estimate_time_non_zero_NK();
if (rv != 0) {
return rv;
}
printf("\n");
return 0;
}
static int bench(void) {
printf("test: %s\n", __func__);
{
enum ggml_task_backend backends[16];
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
if (n_backends < 2) {
printf("test: %s, skipped because no BLAS\n", __func__);
return 0;
}
}
{
struct ggml_init_params init_params = {
/*.mem_size =*/1,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/0,
};
struct ggml_context *ctx = ggml_init(init_params);
GGML_ASSERT(ctx);
ggml_free(ctx);
}
struct ggml_mulmat_tune tune;
struct ggml_mulmat_tune_params params;
init_params(&params, /*m_num*/ 4);
bool ok = ggml_mulmat_tune_bench(&tune, &params);
ggml_mulmat_tune_free(&tune);
return ok ? 0 : 1;
}
static int estimate_time_non_zero_NK(void) {
printf("test: %s\n", __func__);
struct test_data_t {
int M;
int time[3]; // up to 3 profiles; this test uses 2.
};
struct ggml_mulmat_tune tune = {
.version = 1,
.ftype = GGML_FTYPE_MOSTLY_Q4_0,
};
const int m_num = 2;
struct ggml_task_profile_factory pf;
memset(&pf, 0, sizeof(struct ggml_task_profile_factory));
{
pf.n_qxx_f32 = 2;
pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
}
struct ggml_mulmat_tune_params params;
init_params(&params, m_num);
ggml_mulmat_tune_init(&tune, &params, &pf);
struct ggml_mulmat_tune_shape *shape = NULL;
for (int i = 0; i < tune.n_shapes; i++) {
if (tune.shapes[i].N > 0 && tune.shapes[i].K > 0) {
shape = &tune.shapes[i];
break;
}
}
GGML_ASSERT(shape);
GGML_ASSERT(shape->n_profiles == 2);
GGML_ASSERT(ggml_is_quantized(shape->src0_type));
printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K,
shape->n_profiles);
{
shape->items[0] =
(struct ggml_mulmat_tune_m){.M = 2, .stages_time = {2, 4, 0}};
shape->items[1] =
(struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 8, 0}};
shape->items[2] =
(struct ggml_mulmat_tune_m){.M = 2, .stages_time = {4, 4, 0}};
shape->items[3] =
(struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 4, 0}};
}
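// How the expected times below are derived (linear interpolation in M):
// profile 0 (pure CPU): t(2) = 2+4 = 6 and t(4) = 4+8 = 12, so t(M) = 3*M,
//   giving 3, 6, 9, 12, 15 for M = 1..5 (M = 1 and 5 are extrapolated).
// profile 1 (COMPUTE via BLAS): t(2) = t(4) = 4+4 = 8, so the estimate is a
//   constant 8 for every M.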
const struct test_data_t test_data[] = {
{
.M = 1, // out of range
.time = {3, 8},
},
{
.M = 2,
.time = {6, 8},
},
{
.M = 3,
.time = {9, 8},
},
{
.M = 4,
.time = {12, 8},
},
{
.M = 5, // out of range
.time = {15, 8},
},
};
int n_tests = (int)(sizeof(test_data) / sizeof(struct test_data_t));
struct ggml_mulmat_tune_time profile_time[GGML_MAX_TASK_PROFILES];
size_t profile_time_sz =
sizeof(struct ggml_mulmat_tune_time) * GGML_MAX_TASK_PROFILES;
int n_passed = 0;
for (int i = 0; i < n_tests; i++) {
memset(profile_time, 0, profile_time_sz);
const struct test_data_t *e = &test_data[i];
const struct ggml_mulmat_tune_shape *matched_shape =
ggml_mulmat_tune_get_shape(&tune, shape->N, shape->K,
shape->src0_type, shape->src1_type);
GGML_ASSERT(matched_shape);
GGML_ASSERT(matched_shape == shape);
ggml_mulmat_tune_estimate_time(matched_shape, e->M, profile_time);
for (int j = 0; j < shape->n_profiles; j++) {
int actual = profile_time[j].total_time;
int expect = e->time[j];
if (expect != actual) {
fprintf(stderr,
"test fail. i: %d, j: %d, M: %d, expect: "
"%d, actual: %d\n",
i, j, e->M, expect, actual);
} else {
++n_passed;
}
}
}
n_tests *= shape->n_profiles;
printf("%2d of %2d pass\n", n_passed, n_tests);
ggml_mulmat_tune_free(&tune);
return n_passed == n_tests ? 0 : 1;
}