mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-10 18:51:45 +00:00
initial
This commit is contained in:
parent
ce2c7d72e2
commit
213f133701
1
.gitignore
vendored
1
.gitignore
vendored
@ -40,6 +40,7 @@ models/*
|
||||
/server
|
||||
/Pipfile
|
||||
/libllama.so
|
||||
/mulmat-tune
|
||||
|
||||
build-info.h
|
||||
arm_neon.h
|
||||
|
@ -78,6 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
|
||||
option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF)
|
||||
|
||||
#
|
||||
# Build info header
|
||||
@ -214,6 +215,7 @@ if (LLAMA_BLAS)
|
||||
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
|
||||
add_compile_options(${BLAS_LINKER_FLAGS})
|
||||
add_compile_definitions(GGML_USE_OPENBLAS)
|
||||
add_compile_definitions(GGML_BLAS_VENDOR="${LLAMA_BLAS_VENDOR}")
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
|
||||
|
||||
@ -276,6 +278,11 @@ if (LLAMA_METAL)
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_MULMAT_TUNE)
|
||||
add_compile_definitions(GGML_USE_MULMAT_TUNE)
|
||||
add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG)
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
@ -450,6 +457,8 @@ endif()
|
||||
|
||||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
ggml-threading.c
|
||||
ggml-tune.c
|
||||
ggml.h
|
||||
${GGML_SOURCES_CUDA}
|
||||
${GGML_SOURCES_OPENCL}
|
||||
|
31
Makefile
31
Makefile
@ -1,5 +1,5 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple mulmat-tune
|
||||
|
||||
ifdef LLAMA_BUILD_SERVER
|
||||
BUILD_TARGETS += server
|
||||
@ -47,7 +47,8 @@ endif
|
||||
OPT = -O3
|
||||
CFLAGS = -I. $(OPT) -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
# -lm fixed error: ggml.o: undefined reference to symbol 'tanhf@@GLIBC_2.2.5' from ubuntu 22.04
|
||||
LDFLAGS = -lm
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
CFLAGS += -O0 -g
|
||||
@ -134,8 +135,7 @@ ifndef LLAMA_NO_K_QUANTS
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac M1 - include Accelerate framework.
|
||||
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
|
||||
# Mac Intel & M1 - include Accelerate framework.
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -DGGML_USE_ACCELERATE
|
||||
LDFLAGS += -framework Accelerate
|
||||
@ -145,10 +145,16 @@ endif # LLAMA_NO_ACCELERATE
|
||||
ifdef LLAMA_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
# openblas installed with Homebew on macOS.
|
||||
CFLAGS += -I/usr/local/opt/openblas/include
|
||||
LDFLAGS += -L/usr/local/opt/openblas/lib
|
||||
endif
|
||||
endif # LLAMA_OPENBLAS
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
CFLAGS += -DGGML_BLAS_VENDOR="\"BLIS\""
|
||||
LDFLAGS += -lblis -L/usr/local/lib
|
||||
endif # LLAMA_BLIS
|
||||
|
||||
@ -230,6 +236,11 @@ k_quants.o: k_quants.c k_quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_NO_K_QUANTS
|
||||
|
||||
ifdef LLAMA_MULMAT_TUNE
|
||||
CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG
|
||||
CXXFLAGS += -DGGML_USE_MULMAT_TUNE
|
||||
endif
|
||||
|
||||
#
|
||||
# Print build information
|
||||
#
|
||||
@ -245,6 +256,8 @@ $(info I CC: $(CCV))
|
||||
$(info I CXX: $(CXXV))
|
||||
$(info )
|
||||
|
||||
OBJS += ggml-tune.o ggml-threading.o
|
||||
|
||||
#
|
||||
# Build library
|
||||
#
|
||||
@ -253,7 +266,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ggml-threading.o: ggml-threading.c ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml-tune.o: ggml-tune.c ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
@ -298,6 +316,9 @@ server: examples/server/server.cpp examples/server/httplib.h examples/server/jso
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
mulmat-tune: examples/mulmat-tune/mulmat-tune.cpp build-info.h ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o mulmat-tune $(LDFLAGS)
|
||||
|
||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
@sh scripts/build-info.sh > $@.tmp
|
||||
@if ! cmp -s $@.tmp $@; then \
|
||||
|
@ -39,6 +39,7 @@ else()
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(mulmat-tune)
|
||||
if (LLAMA_METAL)
|
||||
add_subdirectory(metal)
|
||||
endif()
|
||||
|
@ -345,6 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.mem_test = true;
|
||||
} else if (arg == "--export") {
|
||||
params.export_cgraph = true;
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
} else if (arg == "--tune") {
|
||||
params.tune = true;
|
||||
} else if (arg == "--tune-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.tune_file = argv[i];
|
||||
#endif // GGML_USE_MULMAT_TUNE
|
||||
} else if (arg == "--verbose-prompt") {
|
||||
params.verbose_prompt = true;
|
||||
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
||||
@ -498,6 +508,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
#endif
|
||||
fprintf(stderr, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n");
|
||||
fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
|
||||
#endif
|
||||
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
|
@ -77,6 +77,8 @@ struct gpt_params {
|
||||
bool mem_test = false; // compute maximum memory usage
|
||||
bool export_cgraph = false; // export the computation graph
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool tune = false; // mulmat tune: enable
|
||||
std::string tune_file = ""; // mulmat tune: data file
|
||||
};
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||
|
@ -117,6 +117,16 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
if (params.tune || !params.tune_file.empty()) {
|
||||
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
|
||||
if (!ok || (params.tune && !params.tune_file.empty())) {
|
||||
llama_free(ctx);
|
||||
return ok? 0: 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
14
examples/mulmat-tune/CMakeLists.txt
Normal file
14
examples/mulmat-tune/CMakeLists.txt
Normal file
@ -0,0 +1,14 @@
|
||||
set(TARGET mulmat-tune)
|
||||
add_executable(${TARGET} mulmat-tune.cpp)
|
||||
|
||||
if (XCODE OR MSVC)
|
||||
set(MULMAT_TUNE_LIBS ggml)
|
||||
else()
|
||||
set(MULMAT_TUNE_LIBS ggml m)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE ${MULMAT_TUNE_LIBS} ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
272
examples/mulmat-tune/README.md
Normal file
272
examples/mulmat-tune/README.md
Normal file
@ -0,0 +1,272 @@
|
||||
# Mulmat Benchmark and Tunning
|
||||
|
||||
Apart from the standalone tool `mulmat-tune`, mulmat tune is also integrated into
|
||||
`main` and `perplexity`. To avoid too many new cli options, I just added two options.
|
||||
To make it run faster, the `m_num` is set as 8 thus max M is 128, and the `n_pass`
|
||||
is set as 1.
|
||||
|
||||
With the newly added cli options, we can use `main` and `perplexity` with the
|
||||
following three ways:
|
||||
|
||||
* bench and run: --tune
|
||||
* bench and exit: --tune --tune-file <FILE>
|
||||
* load and run: --tune-file <FILE>
|
||||
|
||||
The `load` mode reads existing data file. Although this is fine because we can
|
||||
run bench ahead of time (saving tens of seconds), but there are two shortcomings:
|
||||
- have to re-run when format changed, this is OK because we are acknowledged.
|
||||
- the most subtle problem is algorithm was changed silently but we are using the
|
||||
outdated format. So I integrated mulmat tune into `main` and `perplexity` as
|
||||
a complementary solution.
|
||||
|
||||
## Build into main and perplexity
|
||||
|
||||
Makefile:
|
||||
```
|
||||
make clean && LLAMA_MULMAT_TUNE=1 make
|
||||
```
|
||||
|
||||
CMake (with BLAS):
|
||||
```
|
||||
cmake --build . --target clean
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Run examples:
|
||||
|
||||
```
|
||||
# bench and run:
|
||||
|
||||
./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune
|
||||
|
||||
# bench then exit:
|
||||
./main -m ./models/3B/open-llama-3b-q4-0.bin --tune --tune-file <FILE>
|
||||
|
||||
# load and run
|
||||
|
||||
./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file <FILE>
|
||||
```
|
||||
|
||||
# Build the standalone `mulmat-tune`
|
||||
|
||||
Makefile:
|
||||
```
|
||||
make clean && LLAMA_MULMAT_TUNE=1 make
|
||||
```
|
||||
|
||||
CMake (with BLAS)
|
||||
```
|
||||
cmake --build . --target clean
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Run examples:
|
||||
|
||||
```
|
||||
./mulmat-tune -h
|
||||
|
||||
# run with default params (7B, Q4_0, ...)
|
||||
./mulmat-tune
|
||||
|
||||
# set model
|
||||
./mulmat-tune --model 13B
|
||||
|
||||
# set ggml ftype, 2 for Q4_0, 3 for Q4_1, run `mulmat-tune -h` for help.
|
||||
./mulmat-tune --ftype 3
|
||||
|
||||
# customized m_num
|
||||
./mulmat-tune --m_num 8
|
||||
|
||||
# customized n_pass: run 1 pass only instead of the default 3.
|
||||
./mulmat-tune --n_pass 1
|
||||
|
||||
# customized n_threads instead of the default 1.
|
||||
./mulmat-tune --n_threads 4
|
||||
|
||||
# save to file
|
||||
./mulmat-tune --file <FILE>
|
||||
|
||||
# save to file, always override if exists (CAUTION!)
|
||||
./mulmat-tune --file <FILE> -y
|
||||
|
||||
```
|
||||
|
||||
# End to End Test
|
||||
|
||||
## Compare With Master
|
||||
|
||||
You may want to run the following commands. Make sure the tune result file is
|
||||
setup properly.
|
||||
|
||||
General steps:
|
||||
|
||||
1. run `./mulmat-tune -h` to see how to build for misc vendors.
|
||||
you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g:
|
||||
```
|
||||
make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make
|
||||
```
|
||||
On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
|
||||
with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU`
|
||||
or `CPU_BLAS` is more faster (as of the estimation from mulmat tune).
|
||||
2. create a small prompt file:
|
||||
```
|
||||
head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
|
||||
```
|
||||
3. run any of the following example commands.
|
||||
```
|
||||
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
|
||||
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
|
||||
```
|
||||
* `--mlock` is recommended for `macOS`, you may not want to use it.
|
||||
* don't change `-c 128`: too large `context size` causes 0 perplexity trunk.
|
||||
* `-t` is the number of threads, recommend `1`, `2`, `4` or `6`.
|
||||
* you can change the batch size (`-b`) between `1` and `128`.
|
||||
* you may want to add other cli options.
|
||||
|
||||
The following results are generated with Accelerate compiled.
|
||||
|
||||
### 1 thread
|
||||
|
||||
**Master (2d43387d)**
|
||||
|
||||
```
|
||||
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
|
||||
| --- | --------------- |
|
||||
| 8 | 43.53 | 339.95 |
|
||||
| 16 | 44.31 | 346.12 |
|
||||
| 24 | 43.14 | 336.90 |
|
||||
| 32 | 33.59 | 262.25 |
|
||||
| 40 | 27.64 | 215.77 |
|
||||
| 48 | 24.52 | 191.42 |
|
||||
```
|
||||
|
||||
**This branch (tune)**
|
||||
|
||||
```
|
||||
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
|
||||
| --- | --------------- |
|
||||
| 8 | 43.78 | 341.96 |
|
||||
| 16 | 42.88 | 334.93 |
|
||||
| 24 | 42.06 | 328.42 |
|
||||
| 32 | 33.07 | 258.25 |
|
||||
| 40 | 28.69 | 223.98 |
|
||||
| 48 | 25.65 | 200.19 |
|
||||
```
|
||||
|
||||
### 4 threads
|
||||
|
||||
**Master (2d43387d)**
|
||||
|
||||
```
|
||||
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
|
||||
| --- | --------------- |
|
||||
| 8 | 12.43 | 96.99 |
|
||||
| 16 | 12.10 | 94.44 |
|
||||
| 24 | 12.81 | 99.95 |
|
||||
| 32 | 31.64 | 247.04 |
|
||||
| 48 | 24.55 | 191.63 |
|
||||
| 64 | 17.56 | 137.09 |
|
||||
| 96 | 17.59 | 137.25 |
|
||||
| 128 | 10.73 | 83.74 |
|
||||
```
|
||||
|
||||
**This branch (no tune)**
|
||||
|
||||
```
|
||||
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
|
||||
| --- | --------------- |
|
||||
| 8 | 12.31 | 96.07 |
|
||||
| 16 | 12.00 | 93.63 |
|
||||
| 24 | 12.07 | 94.15 |
|
||||
| 32 | 20.34 | 158.76 |
|
||||
| 48 | 15.86 | 123.73 |
|
||||
| 64 | 10.98 | 85.69 |
|
||||
| 96 | 11.24 | 87.66 |
|
||||
| 128 | 7.53 | 58.77 |
|
||||
```
|
||||
|
||||
**This branch (tune)**
|
||||
|
||||
```
|
||||
| M | perplexity (seconds per pass) | prompt eval time (ms per token) |
|
||||
| --- | --------------- |
|
||||
| 8 | 12.48 | 97.37 |
|
||||
| 16 | 12.26 | 95.70 |
|
||||
| 24 | 12.25 | 95.53 |
|
||||
| 32 | 11.98 | 93.58 |
|
||||
| 48 | 12.57 | 98.12 |
|
||||
| 64 | 11.28 | 88.05 |
|
||||
| 96 | 9.55 | 74.53 |
|
||||
| 128 | 7.51 | 58.61 |
|
||||
```
|
||||
|
||||
# Bench Data Format
|
||||
|
||||
**Example**
|
||||
|
||||
```
|
||||
5 3B 2 6 1
|
||||
|
||||
3200 3200 2 0 3 10
|
||||
16 0 0 0 16 1 0 1 0 0 0 0
|
||||
16 1 0 2 17 0 1 0 0 0 0 0
|
||||
0 0 0 0 34 0 1 0 0 0 0 0
|
||||
1 1 793 0 9103 2102 0 0 6014 0
|
||||
2 2 1591 0 8034 2305 0 0 30982 0
|
||||
4 4 2236 0 6476 2484 0 0 31388 0
|
||||
8 7 4161 0 6623 2389 0 0 29204 0
|
||||
16 15 8339 0 6434 2752 0 0 34303 0
|
||||
32 32 16919 0 6915 3651 0 0 42511 0
|
||||
64 200 34270 0 6574 4528 0 0 68212 0
|
||||
128 188 69400 0 6325 6839 0 0 74437 0
|
||||
256 303 134597 0 6168 11544 0 0 110180 0
|
||||
512 687 279685 0 6337 29712 0 0 159728 0
|
||||
|
||||
3200 8640 2 0 2 10
|
||||
|
||||
...
|
||||
|
||||
```
|
||||
|
||||
**Informal Explanation**
|
||||
|
||||
```
|
||||
head
|
||||
groups+
|
||||
|
||||
head := version model ggml_ftype n_shapes n_threads
|
||||
shape+
|
||||
|
||||
# head
|
||||
version: 1
|
||||
model: "3B" | "7B" | "13B" | "30B" | "65B"
|
||||
ggml_ftype: 0 - 4, 7 - 14
|
||||
n_shapes: number of shapes
|
||||
n_threads: number of threads
|
||||
|
||||
shape := N K m_num n_profiles
|
||||
task_conf_profile+
|
||||
bench_item+
|
||||
|
||||
task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize)
|
||||
stage_conf: backend parallel wait
|
||||
backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL)
|
||||
parallel: 0 (false) | 1 (true)
|
||||
wait: 0 (false) | 1 (true)
|
||||
|
||||
bench_item: M profile_time+
|
||||
profile_time := stage_time[3]
|
||||
stage_time[3]: init_time, compute_time, finalize_time
|
||||
```
|
||||
|
||||
A task stage is invalid if it's backend equals to `GGML_TASK_BACKEND_NONE`.
|
||||
Time unit is `us`. A column is all zeros when that stage does not exist.
|
||||
|
||||
# NOTE
|
||||
|
||||
1. "3B" is [open-llama 3B](https://github.com/ggerganov/llama.cpp/pull/1588).
|
||||
2. Model names are subject to change: we may support something like X-3B, Y-4B, ...
|
||||
3. As of Jun 1, this tool is still in early stage, will be changed frequently in
|
||||
recent couple of days (or weeks).
|
277
examples/mulmat-tune/mulmat-tune.cpp
Normal file
277
examples/mulmat-tune/mulmat-tune.cpp
Normal file
@ -0,0 +1,277 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "build-info.h"
|
||||
#include "ggml-tune.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
|
||||
static void print_build_tips(void) {
|
||||
const char *a = "LLAMA_NO_ACCELERATE";
|
||||
fprintf(stderr, "Tips on how to build with various backend vendors:\n\n");
|
||||
fprintf(stderr, "CUDA: make clean; LLAMA_CUBLAS=1 make\n");
|
||||
fprintf(stderr, "CL: make clean; LLAMA_CLBLAST=1 make\n");
|
||||
fprintf(stderr, "Accelerate: make clean; %s= make\n", a);
|
||||
fprintf(stderr, "OpenBLAS: make clean; %s=1 LLAMA_OPENBLAS=1 make\n", a);
|
||||
fprintf(stderr, "BLIS: make clean; %s=1 LLAMA_BLIS=1 make\n", a);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "NOTE: for CUDA/CL, use %s=1 to disable ACCELERATE\n", a);
|
||||
}
|
||||
|
||||
static bool prompt_yes_no(const char *prompt) {
|
||||
char buf[2];
|
||||
while (true) {
|
||||
fprintf(stderr, "%s (Y|n)\n", prompt);
|
||||
buf[0] = 0;
|
||||
buf[1] = 0;
|
||||
int i = 0;
|
||||
int c = 0;
|
||||
|
||||
while (c != '\n') {
|
||||
c = fgetc(stdin);
|
||||
buf[i % 2] = c;
|
||||
i++;
|
||||
}
|
||||
if (i == 1) {
|
||||
if (buf[0] == '\n') {
|
||||
return true;
|
||||
}
|
||||
} else if (i == 2) {
|
||||
if (buf[0] == 'Y' || buf[0] == 'y') {
|
||||
return true;
|
||||
}
|
||||
if (buf[0] == 'N' || buf[0] == 'n') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void cmd_analyze(struct ggml_mulmat_tune *tune);
|
||||
|
||||
static void usage(char *prog) {
|
||||
const char *usage_lines[] = {
|
||||
"usage: %s args",
|
||||
"",
|
||||
"bench [-m MODEL] [-t TYPE] [-f FILE] [-y]",
|
||||
"--model MODEL 3B | 7B | 13B | 30B | 65B",
|
||||
" default 7B",
|
||||
"--ftype FTYPE ggml ftype:",
|
||||
" 0: all F32",
|
||||
" 1: mostly F16",
|
||||
" 2: mostly Q4_0",
|
||||
" 3: mostly Q4_1",
|
||||
" 4: mostly Q4_1, some F16",
|
||||
" 7: mostly Q8_0",
|
||||
" 8: mostly Q5_0",
|
||||
" 9: mostly Q5_1",
|
||||
" 10: mostly Q2_K",
|
||||
" 11: mostly Q3_K",
|
||||
" 12: mostly Q4_K",
|
||||
" 13: mostly Q5_K",
|
||||
" 14: mostly Q6_K",
|
||||
" default 2 (mostly Q4_0)",
|
||||
"--m_num M_NUM number of M, the max M = 2^(M_NUM-1)",
|
||||
" requires between [6, 12]",
|
||||
" default 10",
|
||||
"--n_pass PASS number of passes to run",
|
||||
" default 1",
|
||||
" requires: between [1, 3]",
|
||||
"--n_threads NTH bench with this number of threads",
|
||||
" requires: between [1, 16]",
|
||||
" default 1",
|
||||
"--file FILE data file to write",
|
||||
" default stdout",
|
||||
"-y always answer \"yes\" to all prompts",
|
||||
};
|
||||
|
||||
int len = (int)(sizeof(usage_lines) / sizeof(char *));
|
||||
for (int i = 0; i < len; i++) {
|
||||
const char *line = usage_lines[i];
|
||||
if (i == 0) {
|
||||
fprintf(stderr, line, prog);
|
||||
} else {
|
||||
fprintf(stderr, "%s\n", line);
|
||||
}
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
print_build_tips();
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc == 2) {
|
||||
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
|
||||
usage(argv[0]);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
int arg_start = 1;
|
||||
|
||||
const char *arg_model = NULL;
|
||||
const char *arg_ftype = NULL;
|
||||
const char *arg_m_num = NULL;
|
||||
const char *arg_n_threads = NULL;
|
||||
const char *arg_n_pass = NULL;
|
||||
const char *arg_file = NULL;
|
||||
bool always_yes = false;
|
||||
|
||||
for (int i = arg_start; i < argc; i++) {
|
||||
if (strcmp(argv[i], "--model") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
arg_model = argv[i + 1];
|
||||
++i;
|
||||
}
|
||||
} else if (strcmp(argv[i], "--ftype") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
arg_ftype = argv[i + 1];
|
||||
++i;
|
||||
}
|
||||
} else if (strcmp(argv[i], "--m_num") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
arg_m_num = argv[i + 1];
|
||||
++i;
|
||||
}
|
||||
} else if (strcmp(argv[i], "--n_pass") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
arg_n_pass = argv[i + 1];
|
||||
++i;
|
||||
}
|
||||
} else if (strcmp(argv[i], "--n_threads") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
arg_n_threads = argv[i + 1];
|
||||
++i;
|
||||
}
|
||||
} else if (strcmp(argv[i], "--file") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
arg_file = argv[i + 1];
|
||||
++i;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-y") == 0) {
|
||||
always_yes = true;
|
||||
} else {
|
||||
fprintf(stderr, "invalid arg: %s\n", argv[i]);
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0;
|
||||
{
|
||||
if (arg_ftype != NULL) {
|
||||
int v = atoi(arg_ftype);
|
||||
ftype = (enum ggml_ftype)v;
|
||||
}
|
||||
|
||||
if (ftype > GGML_FTYPE_MOSTLY_Q5_1) {
|
||||
fprintf(stderr, "k_quants type %d is not implemented\n", ftype);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (arg_file != NULL && !always_yes) {
|
||||
struct stat st;
|
||||
int rc = stat(arg_file, &st);
|
||||
UNUSED(st);
|
||||
if (rc == 0) { // prompt
|
||||
size_t len = strlen(arg_file) + 50;
|
||||
char *prompt = (char *)malloc(len);
|
||||
GGML_ASSERT(prompt);
|
||||
snprintf(prompt, len, "data file '%s' exists, override?", arg_file);
|
||||
|
||||
if (!prompt_yes_no(prompt)) {
|
||||
printf("Aborted.\n");
|
||||
return 1;
|
||||
}
|
||||
free(prompt);
|
||||
}
|
||||
}
|
||||
|
||||
int m_num = 10;
|
||||
{
|
||||
if (arg_m_num != NULL) {
|
||||
int v = atoi(arg_m_num);
|
||||
m_num = v;
|
||||
}
|
||||
|
||||
if (m_num < 6 || m_num > 12) {
|
||||
fprintf(stderr, "invalid m_num: %d, expect between [6, 12]\n",
|
||||
m_num);
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int n_pass = 1;
|
||||
{
|
||||
if (arg_n_pass != NULL) {
|
||||
int v = atoi(arg_n_pass);
|
||||
n_pass = v;
|
||||
}
|
||||
if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) {
|
||||
fprintf(stderr, "invalid n_pass: %d, expect between [1, %d]\n",
|
||||
n_pass, GGML_MULMAT_MAX_PASS);
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int n_threads = 1;
|
||||
{
|
||||
if (arg_n_threads != NULL) {
|
||||
int v = atoi(arg_n_threads);
|
||||
n_threads = v;
|
||||
if (n_threads < 1 || n_threads > 16) {
|
||||
fprintf(stderr,
|
||||
"invalid n_threads: %d, expect between [1, 16]\n",
|
||||
n_threads);
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const char *model_name = "7B";
|
||||
{
|
||||
if (arg_model != NULL) {
|
||||
model_name = arg_model;
|
||||
}
|
||||
}
|
||||
|
||||
// Let init message print earlier.
|
||||
{
|
||||
struct ggml_init_params init_params = {
|
||||
/*.mem_size =*/1,
|
||||
/*.mem_buffer =*/NULL,
|
||||
/*.no_alloc =*/0,
|
||||
};
|
||||
struct ggml_context *ctx = ggml_init(init_params);
|
||||
GGML_ASSERT(ctx);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
struct ggml_mulmat_tune tune;
|
||||
|
||||
struct ggml_mulmat_tune_params params;
|
||||
memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params));
|
||||
|
||||
ggml_mulmat_init_task_profiles();
|
||||
|
||||
ggml_mulmat_tune_model_init(¶ms.model, model_name, ftype);
|
||||
params.m_num = m_num;
|
||||
params.n_pass = n_pass;
|
||||
params.n_threads = n_threads;
|
||||
params.progress = true;
|
||||
params.output_console = true;
|
||||
params.fname = arg_file;
|
||||
|
||||
bool ok = ggml_mulmat_tune_bench(&tune, ¶ms);
|
||||
return ok ? 0 : 1;
|
||||
}
|
@ -158,6 +158,16 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
if (params.tune || !params.tune_file.empty()){
|
||||
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
|
||||
if (!ok || (params.tune && !params.tune_file.empty())) {
|
||||
llama_free(ctx);
|
||||
return ok? 0: 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
@ -2571,7 +2571,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
func = ggml_cuda_rms_norm;
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
|
||||
if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_mul_mat;
|
||||
|
@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_
|
||||
}
|
||||
|
||||
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
|
||||
GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
|
||||
// GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_f32(src0, src1, dst);
|
||||
|
620
ggml-threading.c
Normal file
620
ggml-threading.c
Normal file
@ -0,0 +1,620 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml-threading.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
|
||||
// see https://github.com/ggerganov/llama.cpp/pull/1314
|
||||
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
||||
#include <emmintrin.h>
|
||||
static inline void ggml_spin_pause(void) { _mm_pause(); }
|
||||
#else
|
||||
static inline void ggml_spin_pause(void) {}
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
#include <windows.h>
|
||||
|
||||
typedef volatile LONG atomic_int;
|
||||
typedef atomic_int atomic_bool;
|
||||
typedef LONG atomic_flag;
|
||||
|
||||
typedef CRITICAL_SECTION pthread_mutex_t;
|
||||
typedef CONDITION_VARIABLE pthread_cond_t;
|
||||
typedef void pthread_mutexattr_t;
|
||||
typedef void pthread_condattr_t;
|
||||
|
||||
typedef HANDLE pthread_t;
|
||||
|
||||
static void atomic_store(atomic_int *ptr, LONG val) {
|
||||
InterlockedExchange(ptr, val);
|
||||
}
|
||||
|
||||
static LONG atomic_load(atomic_int *ptr) {
|
||||
return InterlockedCompareExchange(ptr, 0, 0);
|
||||
}
|
||||
|
||||
static LONG atomic_fetch_add(atomic_int *ptr, LONG inc) {
|
||||
return InterlockedExchangeAdd(ptr, inc);
|
||||
}
|
||||
|
||||
static LONG atomic_fetch_sub(atomic_int *ptr, LONG dec) {
|
||||
return atomic_fetch_add(ptr, -(dec));
|
||||
}
|
||||
|
||||
static inline LONG atomic_flag_test_and_set(volatile atomic_flag *ptr) {
|
||||
return InterlockedCompareExchange(ptr, 1, 0);
|
||||
}
|
||||
static inline LONG atomic_flag_clear(volatile atomic_flag *ptr) {
|
||||
return InterlockedExchange(ptr, 0);
|
||||
}
|
||||
static int pthread_create(pthread_t *out, void *unused,
|
||||
ggml_thread_ret_t (*func)(void *), void *arg) {
|
||||
(void)unused;
|
||||
HANDLE handle =
|
||||
CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
|
||||
if (handle == NULL) {
|
||||
return EAGAIN;
|
||||
}
|
||||
|
||||
*out = handle;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_join(pthread_t thread, void *unused) {
|
||||
(void)unused;
|
||||
return (int)WaitForSingleObject(thread, INFINITE);
|
||||
}
|
||||
|
||||
static int pthread_mutex_init(pthread_mutex_t *mutex,
|
||||
pthread_mutexattr_t *attr) {
|
||||
(void)attr;
|
||||
InitializeCriticalSection(mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_mutex_destroy(pthread_mutex_t *mutex) {
|
||||
DeleteCriticalSection(mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_mutex_lock(pthread_mutex_t *mutex) {
|
||||
EnterCriticalSection(mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_mutex_unlock(pthread_mutex_t *mutex) {
|
||||
LeaveCriticalSection(mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr) {
|
||||
(void)attr;
|
||||
InitializeConditionVariable(cond);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_cond_destroy(pthread_cond_t *cond) {
|
||||
(void)cond;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) {
|
||||
SleepConditionVariableCS(cond, mutex, INFINITE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_cond_signal(pthread_cond_t *cond) {
|
||||
WakeConditionVariable(cond);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pthread_cond_broadcast(pthread_cond_t *cond) {
|
||||
WakeAllConditionVariable(cond);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sched_yield(void) {
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-yieldprocessor
|
||||
YieldProcessor();
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else // ! _WIN32
|
||||
|
||||
#include <pthread.h>
|
||||
#include <stdatomic.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#endif
|
||||
|
||||
// #define GGML_THREADING_DEBUG 1
|
||||
|
||||
#ifdef GGML_THREADING_DEBUG
|
||||
#define PRINT_DEBUG(...) fprintf(stdout, __VA_ARGS__)
|
||||
#else
|
||||
#define PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
struct ggml_perf_stats {
|
||||
int runs;
|
||||
|
||||
// total cycles
|
||||
atomic_int cycles;
|
||||
|
||||
// total time in us.
|
||||
atomic_int time_us;
|
||||
};
|
||||
|
||||
struct ggml_compute_state_shared {
|
||||
atomic_flag spin;
|
||||
pthread_mutex_t mutex;
|
||||
pthread_cond_t cond;
|
||||
|
||||
// number of threads that has entered thread runner.
|
||||
atomic_int n_ready;
|
||||
|
||||
// number of assigned but unfinished tasks, workers decrease it.
|
||||
atomic_int n_tasks;
|
||||
|
||||
// number of waiting workers, workers increase it.
|
||||
atomic_int n_waiting;
|
||||
|
||||
// commands.
|
||||
atomic_bool wait_now;
|
||||
atomic_bool wait_on_done;
|
||||
atomic_bool stop;
|
||||
|
||||
ggml_threading_task_runner *task_runner;
|
||||
|
||||
struct ggml_threading_context *ctx;
|
||||
};
|
||||
struct ggml_compute_state {
|
||||
pthread_t thrd;
|
||||
|
||||
atomic_bool has_work;
|
||||
struct ggml_compute_params params;
|
||||
struct ggml_tensor *node;
|
||||
|
||||
struct ggml_compute_state_shared *shared;
|
||||
};
|
||||
struct ggml_threading_context {
|
||||
int n_threads;
|
||||
struct ggml_compute_state_shared shared;
|
||||
struct ggml_compute_state *workers;
|
||||
|
||||
enum ggml_threading_features features;
|
||||
|
||||
struct ggml_perf_stats wait_perf;
|
||||
struct ggml_perf_stats wakeup_perf;
|
||||
|
||||
int64_t *stages_time;
|
||||
};
|
||||
|
||||
// NOTE: ggml_spin_lock and ggml_spin_unlock may can be noop if
|
||||
// feature wait_on_done is off.
|
||||
static inline void ggml_spin_lock(volatile atomic_flag *obj) {
|
||||
while (atomic_flag_test_and_set(obj)) {
|
||||
ggml_spin_pause();
|
||||
}
|
||||
}
|
||||
|
||||
static inline void ggml_spin_unlock(volatile atomic_flag *obj) {
|
||||
atomic_flag_clear(obj);
|
||||
}
|
||||
|
||||
static inline void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0,
|
||||
int64_t t0) {
|
||||
st->runs++;
|
||||
st->cycles += (ggml_cycles() - c0);
|
||||
st->time_us += (ggml_time_us() - t0);
|
||||
}
|
||||
|
||||
// A worker thread goes cond waiting.
|
||||
// NOTE: must be protected by shared->spin
|
||||
static void ggml_threading_cond_wait(struct ggml_compute_state *state) {
|
||||
struct ggml_compute_state_shared *shared = state->shared;
|
||||
|
||||
int64_t perf_cycles_0 = 0;
|
||||
int64_t perf_time_0 = 0;
|
||||
|
||||
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
|
||||
perf_cycles_0 = ggml_cycles();
|
||||
perf_time_0 = ggml_time_us();
|
||||
}
|
||||
|
||||
GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
|
||||
|
||||
if (!shared->wait_now) {
|
||||
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
|
||||
ggml_spin_unlock(&shared->spin);
|
||||
return;
|
||||
}
|
||||
|
||||
shared->n_waiting++;
|
||||
ggml_spin_unlock(&shared->spin);
|
||||
|
||||
GGML_ASSERT(pthread_cond_wait(&shared->cond, &shared->mutex) == 0);
|
||||
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
|
||||
|
||||
ggml_spin_lock(&shared->spin);
|
||||
|
||||
shared->n_waiting--;
|
||||
|
||||
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
|
||||
ggml_perf_collect(&shared->ctx->wait_perf, perf_cycles_0, perf_time_0);
|
||||
}
|
||||
}
|
||||
|
||||
// Wakeup all workers.
|
||||
//
|
||||
// Workers takes some time to wakeup, and has to lock spin after wakeup. Yield
|
||||
// is used to avoid signal frequently. Current implementation is highly
|
||||
// experimental. See tests/test-ggml-threading.c for details.
|
||||
//
|
||||
// NOTE: must be protected by shared->spin
|
||||
static void
|
||||
ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) {
|
||||
int64_t perf_cycles_0 = 0;
|
||||
int64_t perf_time_0 = 0;
|
||||
|
||||
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
|
||||
perf_cycles_0 = ggml_cycles();
|
||||
perf_time_0 = ggml_time_us();
|
||||
}
|
||||
|
||||
shared->wait_now = false;
|
||||
|
||||
int loop_counter = 0;
|
||||
int notify_counter = 0;
|
||||
int64_t last_signal_time = 0;
|
||||
|
||||
while (shared->n_waiting != 0) {
|
||||
ggml_spin_unlock(&shared->spin);
|
||||
|
||||
if (loop_counter > 0) {
|
||||
ggml_spin_pause();
|
||||
if (loop_counter > 3) {
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
++loop_counter;
|
||||
|
||||
// TODO: should bench actual average wait/wakeup time.
|
||||
if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) {
|
||||
continue;
|
||||
}
|
||||
|
||||
GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
|
||||
GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0);
|
||||
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
|
||||
++notify_counter;
|
||||
last_signal_time = ggml_time_us();
|
||||
|
||||
ggml_spin_lock(&shared->spin);
|
||||
}
|
||||
|
||||
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
|
||||
ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0,
|
||||
perf_time_0);
|
||||
}
|
||||
|
||||
// if (notify_counter > 1) {
|
||||
// printf("%s: loop counter: %d, notify counter: %d\n", __func__,
|
||||
// loop_counter, notify_counter);
|
||||
// }
|
||||
UNUSED(notify_counter);
|
||||
}
|
||||
|
||||
// Setup workers for a task stage.
|
||||
// NOTE: must be protected by shared->spin
|
||||
static void ggml_threading_setup_workers(struct ggml_threading_context *ctx,
|
||||
struct ggml_task_profile *profile,
|
||||
enum ggml_task_type type) {
|
||||
PRINT_DEBUG("[main] setup workers for task ...\n");
|
||||
|
||||
#ifdef GGML_THREADING_DEBUG
|
||||
int64_t t0 = ggml_time_us();
|
||||
#endif
|
||||
|
||||
const int n_worker_threads = ctx->n_threads - 1;
|
||||
struct ggml_task_stage *current = &profile->stages[type];
|
||||
struct ggml_compute_state_shared *shared = &ctx->shared;
|
||||
|
||||
if (current->parallel) {
|
||||
if (shared->n_waiting > 0) {
|
||||
ggml_threading_wakeup_workers(shared);
|
||||
}
|
||||
|
||||
if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) {
|
||||
// Optimize energy: wait_on_done. We MAY also check following nodes,
|
||||
// but that's a bit complicated.
|
||||
shared->wait_on_done = false;
|
||||
for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) {
|
||||
struct ggml_task_stage *next = &profile->stages[i];
|
||||
if (next->parallel) {
|
||||
break;
|
||||
}
|
||||
if (next->wait) {
|
||||
shared->wait_on_done = true;
|
||||
PRINT_DEBUG("[main] wait_on_done is enabled for "
|
||||
"current task stage\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (current->wait) {
|
||||
if (shared->n_waiting < n_worker_threads) {
|
||||
shared->wait_now = true;
|
||||
PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n",
|
||||
n_worker_threads);
|
||||
ggml_spin_unlock(&shared->spin);
|
||||
|
||||
while (shared->n_waiting != n_worker_threads) {
|
||||
ggml_spin_pause();
|
||||
}
|
||||
|
||||
ggml_spin_lock(&shared->spin);
|
||||
PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads);
|
||||
}
|
||||
}
|
||||
|
||||
PRINT_DEBUG("[main] setup workers for task took %d us\n",
|
||||
(int)(ggml_time_us() - t0));
|
||||
}
|
||||
|
||||
ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
|
||||
GGML_ASSERT(data);
|
||||
struct ggml_compute_state *state = (struct ggml_compute_state *)data;
|
||||
GGML_ASSERT(state);
|
||||
|
||||
struct ggml_compute_state_shared *shared = state->shared;
|
||||
GGML_ASSERT(shared);
|
||||
GGML_ASSERT(shared->task_runner);
|
||||
|
||||
shared->n_ready++;
|
||||
|
||||
PRINT_DEBUG("[%d-th] running\n", state->params.ith);
|
||||
|
||||
while (!shared->stop) {
|
||||
if (shared->wait_now) {
|
||||
ggml_spin_lock(&shared->spin);
|
||||
if (!state->has_work) {
|
||||
ggml_threading_cond_wait(state);
|
||||
}
|
||||
ggml_spin_unlock(&shared->spin);
|
||||
}
|
||||
|
||||
if (shared->n_tasks > 0 && state->has_work) {
|
||||
enum ggml_compute_error err =
|
||||
shared->task_runner(&state->params, state->node);
|
||||
|
||||
GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
|
||||
|
||||
ggml_spin_lock(&shared->spin);
|
||||
|
||||
state->has_work = false;
|
||||
shared->n_tasks--;
|
||||
|
||||
bool wait = shared->wait_on_done && !state->has_work;
|
||||
if (wait) {
|
||||
ggml_threading_cond_wait(state);
|
||||
}
|
||||
|
||||
ggml_spin_unlock(&shared->spin);
|
||||
|
||||
// no need to pause.
|
||||
if (wait) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_spin_pause();
|
||||
}
|
||||
|
||||
PRINT_DEBUG("[%d-th] exited\n", state->params.ith);
|
||||
return 0;
|
||||
}
|
||||
|
||||
enum ggml_compute_error
|
||||
ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
|
||||
struct ggml_tensor *node, void *wdata,
|
||||
size_t wsize) {
|
||||
GGML_ASSERT(ctx);
|
||||
GGML_ASSERT(node);
|
||||
|
||||
GGML_ASSERT(ctx->shared.task_runner);
|
||||
struct ggml_compute_state_shared *state_shared = &ctx->shared;
|
||||
|
||||
// This is the params for main thread.
|
||||
struct ggml_compute_params params;
|
||||
enum ggml_compute_error err;
|
||||
|
||||
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
|
||||
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
PRINT_DEBUG("[main] stage: %d\n", type);
|
||||
|
||||
int64_t t_stage = 0;
|
||||
if (ctx->stages_time) {
|
||||
t_stage = ggml_time_us();
|
||||
}
|
||||
|
||||
// n_tasks is the total number of parallel computing tasks
|
||||
// (including main thread).
|
||||
int n_tasks =
|
||||
node->task_profile.stages[type].parallel ? ctx->n_threads : 1;
|
||||
|
||||
ggml_spin_lock(&state_shared->spin);
|
||||
|
||||
if (ctx->n_threads > 1) {
|
||||
ggml_threading_setup_workers(ctx, &node->task_profile, type);
|
||||
}
|
||||
|
||||
if (n_tasks > 1) {
|
||||
// setup compute task parameters.
|
||||
for (int j = 0; j < n_tasks - 1; j++) {
|
||||
ctx->workers[j].params = (struct ggml_compute_params){
|
||||
.type = type,
|
||||
.ith = j + 1,
|
||||
.nth = n_tasks,
|
||||
.wsize = wsize,
|
||||
.wdata = wdata,
|
||||
};
|
||||
ctx->workers[j].node = node;
|
||||
ctx->workers[j].has_work = true;
|
||||
}
|
||||
state_shared->n_tasks = n_tasks - 1;
|
||||
PRINT_DEBUG("[main] assigned %d tasks\n", state_shared->n_tasks);
|
||||
}
|
||||
|
||||
ggml_spin_unlock(&state_shared->spin);
|
||||
|
||||
// main thread always run the 0-th task.
|
||||
// TODO: assert(params->nth == 1) instead of
|
||||
// assert(params->ith == 0)
|
||||
{
|
||||
params.type = type;
|
||||
params.ith = 0;
|
||||
params.nth = n_tasks;
|
||||
params.wsize = wsize;
|
||||
params.wdata = wdata;
|
||||
|
||||
err = state_shared->task_runner(¶ms, node);
|
||||
}
|
||||
|
||||
// wait for tasks done.
|
||||
if (n_tasks > 1) {
|
||||
while (state_shared->n_tasks != 0) {
|
||||
ggml_spin_pause();
|
||||
}
|
||||
}
|
||||
|
||||
PRINT_DEBUG("[main] all tasks finished\n\n");
|
||||
|
||||
if (ctx->stages_time) {
|
||||
ctx->stages_time[type] = ggml_time_us() - t_stage;
|
||||
}
|
||||
|
||||
if (err != GGML_COMPUTE_OK) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
return GGML_COMPUTE_OK;
|
||||
}
|
||||
|
||||
struct ggml_threading_context *
|
||||
ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
|
||||
ggml_threading_task_runner *task_stage_runner,
|
||||
enum ggml_threading_features features,
|
||||
int64_t stages_time[3]) {
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
GGML_ASSERT(thread_runner);
|
||||
GGML_ASSERT(task_stage_runner);
|
||||
|
||||
size_t ctx_sz = sizeof(struct ggml_threading_context);
|
||||
struct ggml_threading_context *ctx = malloc(ctx_sz);
|
||||
GGML_ASSERT(ctx);
|
||||
memset(ctx, 0, ctx_sz);
|
||||
|
||||
ctx->shared = (struct ggml_compute_state_shared){
|
||||
.spin = {0},
|
||||
.n_ready = 0,
|
||||
.n_tasks = 0,
|
||||
.n_waiting = 0,
|
||||
.wait_now = false,
|
||||
.wait_on_done = false,
|
||||
.stop = false,
|
||||
.task_runner = task_stage_runner,
|
||||
.ctx = ctx,
|
||||
};
|
||||
|
||||
PRINT_DEBUG("[main] thread start, features: %d\n", features);
|
||||
|
||||
ctx->n_threads = n_threads;
|
||||
ctx->features = features;
|
||||
ctx->stages_time = stages_time;
|
||||
|
||||
int n_workers = n_threads - 1;
|
||||
if (n_workers > 0) {
|
||||
GGML_ASSERT(pthread_mutex_init(&ctx->shared.mutex, NULL) == 0);
|
||||
GGML_ASSERT(pthread_cond_init(&ctx->shared.cond, NULL) == 0);
|
||||
|
||||
size_t workers_sz = sizeof(struct ggml_compute_state) * n_workers;
|
||||
struct ggml_compute_state *workers = malloc(workers_sz);
|
||||
GGML_ASSERT(workers);
|
||||
memset(workers, 0, workers_sz);
|
||||
|
||||
for (int j = 0; j < n_workers; j++) {
|
||||
workers[j].shared = &ctx->shared;
|
||||
GGML_ASSERT(pthread_create(&workers[j].thrd, NULL, thread_runner,
|
||||
&workers[j]) == 0);
|
||||
}
|
||||
|
||||
ctx->workers = workers;
|
||||
|
||||
while (ctx->shared.n_ready != n_workers) {
|
||||
ggml_spin_pause();
|
||||
}
|
||||
}
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
static void
|
||||
ggml_threading_print_perf_stats(struct ggml_threading_context *ctx) {
|
||||
bool print_stats = ctx->features & GGML_THREADING_FEATURE_PERF;
|
||||
#ifdef GGML_THREADING_DEBUG
|
||||
print_stats = true;
|
||||
#endif
|
||||
|
||||
if (!print_stats) {
|
||||
return;
|
||||
}
|
||||
|
||||
const char *prefix_arr[2] = {"[threading wait ]", "[threading wakeup]"};
|
||||
struct ggml_perf_stats *st_arr[2] = {&ctx->wait_perf, &ctx->wakeup_perf};
|
||||
for (int i = 0; i < 2; i++) {
|
||||
struct ggml_perf_stats *st = st_arr[i];
|
||||
if (st->runs == 0) {
|
||||
continue;
|
||||
}
|
||||
fprintf(stdout,
|
||||
"%s runs: %4d, avg cycles: %8.3f ms, avg time: "
|
||||
"%8.3f ms\n",
|
||||
prefix_arr[i], st->runs,
|
||||
1.0 * st->cycles / (st->runs * ggml_cycles_per_ms()),
|
||||
1.0 * st->time_us / (st->runs * 1000));
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_threading_stop(struct ggml_threading_context *ctx) {
|
||||
GGML_ASSERT(ctx);
|
||||
|
||||
if (ctx->workers) {
|
||||
PRINT_DEBUG("[main] stopping thread pool ...\n");
|
||||
ctx->shared.stop = true;
|
||||
|
||||
ggml_spin_lock(&ctx->shared.spin);
|
||||
ggml_threading_wakeup_workers(&ctx->shared);
|
||||
ggml_spin_unlock(&ctx->shared.spin);
|
||||
|
||||
for (int j = 0; j < ctx->n_threads - 1; j++) {
|
||||
GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0);
|
||||
}
|
||||
free(ctx->workers);
|
||||
PRINT_DEBUG("[main] thread pool stopped\n");
|
||||
}
|
||||
|
||||
ggml_threading_print_perf_stats(ctx);
|
||||
|
||||
free(ctx);
|
||||
}
|
68
ggml-threading.h
Normal file
68
ggml-threading.h
Normal file
@ -0,0 +1,68 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
typedef int ggml_thread_ret_t;
|
||||
#else
|
||||
typedef void *ggml_thread_ret_t;
|
||||
#endif
|
||||
|
||||
struct ggml_threading_context;
|
||||
|
||||
// Optional (experimental) features.
|
||||
enum ggml_threading_features {
|
||||
GGML_THREADING_FEATURE_NONE = 0,
|
||||
GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0,
|
||||
GGML_THREADING_FEATURE_PERF = 1 << 1,
|
||||
};
|
||||
|
||||
// Compute errors.
|
||||
enum ggml_compute_error {
|
||||
GGML_COMPUTE_OK = 0,
|
||||
GGML_COMPUTE_FALLBACK = 1,
|
||||
};
|
||||
|
||||
// The task runner to be called by main thread and workers.
|
||||
typedef enum ggml_compute_error(ggml_threading_task_runner)(
|
||||
struct ggml_compute_params *params, struct ggml_tensor *node);
|
||||
|
||||
// The thread runner to feed into OS threads.
|
||||
typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
|
||||
|
||||
// Init and start underlying workers if n_threads > 1.
|
||||
//
|
||||
// features: optional for configure threading additional features.
|
||||
// see `ggml_threading_feature`, default 0.
|
||||
// stages_time: optional for collecting per-stage wall clock time.
|
||||
struct ggml_threading_context *
|
||||
ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread,
|
||||
ggml_threading_task_runner *task_stage_runner,
|
||||
enum ggml_threading_features features,
|
||||
int64_t stages_time[3]);
|
||||
|
||||
// Stop workers (if exist), free memories (including the ctx).
|
||||
void ggml_threading_stop(struct ggml_threading_context *ctx);
|
||||
|
||||
// The default implementation of `ggml_threading_thread_runner`
|
||||
ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data);
|
||||
|
||||
// Compute a tensor. It computes the enabled task stages one by one.
|
||||
// Caller should take care of the return error: retry for fallback error.
|
||||
enum ggml_compute_error
|
||||
ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
|
||||
struct ggml_tensor *node, void *wdata,
|
||||
size_t wsize);
|
||||
|
||||
// This is an experimental functionality for mulmat tune, as a thin wrapper.
|
||||
enum ggml_compute_error
|
||||
ggml_compute_forward_wrapper(struct ggml_compute_params *params,
|
||||
struct ggml_tensor *tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
897
ggml-tune.c
Normal file
897
ggml-tune.c
Normal file
@ -0,0 +1,897 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml-threading.h"
|
||||
#include "ggml-tune.h"
|
||||
#include "ggml.h"
|
||||
|
||||
// MUL_MAT fine tunning for non-GPU-offloading cases.
|
||||
|
||||
#define GGML_MULMAT_CACHE_LEN 16
|
||||
static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0};
|
||||
|
||||
#define FNV_OFFSET 14695981039346656037UL
|
||||
#define FNV_PRIME 1099511628211UL
|
||||
static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) {
|
||||
char buf[30];
|
||||
snprintf(buf, 30, "%d%d%d", M, N, K);
|
||||
|
||||
uint64_t hash = FNV_OFFSET;
|
||||
for (const char *p = buf; *p; p++) {
|
||||
hash ^= (uint64_t)(unsigned char)(*p);
|
||||
hash *= FNV_PRIME;
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
static const char *
|
||||
ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
|
||||
switch (backend) {
|
||||
case GGML_TASK_BACKEND_NONE:
|
||||
return "";
|
||||
case GGML_TASK_BACKEND_CPU:
|
||||
return "CPU";
|
||||
case GGML_TASK_BACKEND_CPU_BLAS:
|
||||
return "BLAS";
|
||||
case GGML_TASK_BACKEND_GPU:
|
||||
return "GPU";
|
||||
case GGML_TASK_BACKEND_GPU_CUDA:
|
||||
return "CUDA";
|
||||
case GGML_TASK_BACKEND_GPU_CL:
|
||||
return "CL";
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
|
||||
struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t,
|
||||
enum ggml_type src1_t, int stages_time[3]) {
|
||||
GGML_ASSERT(tune);
|
||||
|
||||
// TODO: default_mm_cache is thread-unsafe.
|
||||
struct mm_cache_element *mm_cache = default_mm_cache;
|
||||
int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN;
|
||||
struct mm_cache_element *e = &mm_cache[slot];
|
||||
|
||||
struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
|
||||
|
||||
struct ggml_task_profile *prof = NULL;
|
||||
|
||||
if (e->M == M && e->N == N && e->K == K) {
|
||||
prof = e->profile;
|
||||
if (stages_time) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
stages_time[i] = e->stages_time[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const struct ggml_mulmat_tune_shape *shape = NULL;
|
||||
shape = ggml_mulmat_tune_get_shape(tune, N, K, src0_t, src1_t);
|
||||
if (shape) {
|
||||
ggml_mulmat_tune_estimate_time(shape, M, profiles_time);
|
||||
|
||||
int min = INT32_MAX;
|
||||
int index = -1;
|
||||
for (int i = 0; i < shape->n_profiles; i++) {
|
||||
int total = profiles_time[i].total_time;
|
||||
if (total < min) {
|
||||
min = total;
|
||||
index = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (index >= 0) {
|
||||
prof = profiles_time[index].profile;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
int t = profiles_time[index].stage_time[i];
|
||||
if (stages_time) {
|
||||
stages_time[i] = t;
|
||||
}
|
||||
e->stages_time[i] = t;
|
||||
}
|
||||
|
||||
GGML_ASSERT(prof);
|
||||
|
||||
e->profile = prof;
|
||||
e->M = M;
|
||||
e->N = N;
|
||||
e->K = K;
|
||||
|
||||
// to disable this, build with
|
||||
// `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1
|
||||
// make`
|
||||
#if !defined(GGML_MULMAT_TUNE_NDEBUG)
|
||||
const char *names[3];
|
||||
for (int i = 0; i < 3; i++) {
|
||||
names[i] = ggml_mulmat_tune_task_backend_name(
|
||||
prof->stages[i].backend);
|
||||
}
|
||||
printf(
|
||||
"\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the "
|
||||
"fastest profile: %s %s %s\n",
|
||||
M, N, K, names[0], names[1], names[2]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return prof;
|
||||
}
|
||||
|
||||
void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
|
||||
const char *name, enum ggml_ftype ftype) {
|
||||
const int n_vocab = 32000;
|
||||
int n_embd;
|
||||
// n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult
|
||||
int n_ff;
|
||||
// n_rot = n_embd/n_head;
|
||||
int n_rot;
|
||||
|
||||
if (strcmp(name, "3B") == 0) {
|
||||
// n_head=32, n_mult=216, n_layer=26
|
||||
// https://github.com/ggerganov/llama.cpp/pull/1588
|
||||
n_embd = 3200;
|
||||
n_ff = 8640;
|
||||
n_rot = 100;
|
||||
} else if (strcmp(name, "7B") == 0) {
|
||||
n_embd = 4096;
|
||||
n_ff = 11008;
|
||||
n_rot = 128;
|
||||
} else if (strcmp(name, "13B") == 0) {
|
||||
n_embd = 5120;
|
||||
n_ff = 13824;
|
||||
n_rot = 128;
|
||||
} else if (strcmp(name, "30B") == 0) {
|
||||
n_embd = 6656;
|
||||
n_ff = 17920;
|
||||
n_rot = 128;
|
||||
} else if (strcmp(name, "65B") == 0) {
|
||||
n_embd = 8192;
|
||||
n_ff = 22016;
|
||||
n_rot = 128;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
model->name = name;
|
||||
model->ftype = ftype;
|
||||
model->n_vocab = n_vocab;
|
||||
model->n_embd = n_embd;
|
||||
model->n_ff = n_ff;
|
||||
model->n_rot = n_rot;
|
||||
}
|
||||
|
||||
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
|
||||
struct ggml_mulmat_tune_params *params,
|
||||
struct ggml_task_profile_factory *pf) {
|
||||
|
||||
struct ggml_mulmat_tune_model *model = ¶ms->model;
|
||||
|
||||
memset(tune, 0, sizeof(struct ggml_mulmat_tune));
|
||||
|
||||
tune->version = GGML_MULMAT_TUNE_VERSION;
|
||||
tune->n_threads = params->n_threads;
|
||||
tune->ftype = model->ftype;
|
||||
|
||||
size_t name_len = strlen(model->name);
|
||||
GGML_ASSERT(name_len > 0);
|
||||
strncpy(tune->model, model->name, sizeof(tune->model) - 1);
|
||||
|
||||
const enum ggml_type rot_src0_type = GGML_TYPE_F16;
|
||||
const enum ggml_type src1_type = GGML_TYPE_F32;
|
||||
|
||||
int n_vocab = model->n_vocab;
|
||||
int n_embd = model->n_embd;
|
||||
int n_ff = model->n_ff;
|
||||
int n_rot = model->n_rot;
|
||||
|
||||
enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype);
|
||||
|
||||
GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6);
|
||||
tune->n_shapes = GGML_MULMAT_N_SHAPES;
|
||||
|
||||
// Attention layers
|
||||
tune->shapes[0] = (struct ggml_mulmat_tune_shape){
|
||||
.N = n_embd, .K = n_embd, .src0_type = type, .src1_type = src1_type};
|
||||
// Feed forward layers
|
||||
tune->shapes[1] = (struct ggml_mulmat_tune_shape){
|
||||
.N = n_embd, .K = n_ff, .src0_type = type, .src1_type = src1_type};
|
||||
tune->shapes[2] = (struct ggml_mulmat_tune_shape){
|
||||
.N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type};
|
||||
tune->shapes[3] = (struct ggml_mulmat_tune_shape){
|
||||
.N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type};
|
||||
// RoPE
|
||||
tune->shapes[4] = (struct ggml_mulmat_tune_shape){
|
||||
.N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type};
|
||||
tune->shapes[5] = (struct ggml_mulmat_tune_shape){
|
||||
.N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type};
|
||||
|
||||
for (int i = 0; i < tune->n_shapes; i++) {
|
||||
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
|
||||
shape->n_profiles = ggml_mulmat_get_task_profiles(
|
||||
pf, shape->src0_type, shape->src1_type, &shape->profiles);
|
||||
if (shape->n_profiles == 0) {
|
||||
// allowed for testing.
|
||||
continue;
|
||||
}
|
||||
|
||||
shape->m_num = params->m_num;
|
||||
shape->arr_m = malloc(shape->m_num * sizeof(int));
|
||||
for (int j = 0; j < shape->m_num; j++) {
|
||||
shape->arr_m[j] = 1 << j;
|
||||
}
|
||||
|
||||
size_t sz = sizeof(struct ggml_mulmat_tune_m) *
|
||||
(shape->n_profiles * shape->m_num);
|
||||
shape->items = malloc(sz);
|
||||
GGML_ASSERT(shape->items);
|
||||
memset(shape->items, 0, sz);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
|
||||
for (int i = 0; i < tune->n_shapes; i++) {
|
||||
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
|
||||
GGML_ASSERT(shape);
|
||||
|
||||
// arr_m and items can be NULL only when testing.
|
||||
if (shape->arr_m) {
|
||||
free(shape->arr_m);
|
||||
}
|
||||
if (shape->items) {
|
||||
free(shape->items);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_mulmat_tune_write_profiles(
|
||||
FILE *fp, const struct ggml_task_profile *profiles, int n_profiles) {
|
||||
int rc;
|
||||
for (int i = 0; i < n_profiles; i++) {
|
||||
const struct ggml_task_profile *profile = &profiles[i];
|
||||
for (int j = 0; j < 3; j++) {
|
||||
const struct ggml_task_stage *ts = &profile->stages[j];
|
||||
rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
|
||||
ts->wait ? 1 : 0);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
if (j < 2) {
|
||||
rc = fprintf(fp, " ");
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
rc = fprintf(fp, "\n");
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
|
||||
const char *model, int ftype, int n_threads,
|
||||
char *errbuf, int errbuf_len) {
|
||||
|
||||
if (tune->version != GGML_MULMAT_TUNE_VERSION) {
|
||||
snprintf(errbuf, errbuf_len - 1,
|
||||
"version mismatch, built-in: %d, "
|
||||
"yours: %d",
|
||||
GGML_MULMAT_TUNE_VERSION, tune->version);
|
||||
return false;
|
||||
} else if (strcmp(model, tune->model) != 0) {
|
||||
snprintf(errbuf, errbuf_len - 1,
|
||||
"model mismatch. built-in: %s, yours: %s", model, tune->model);
|
||||
return false;
|
||||
} else if (ftype != tune->ftype) {
|
||||
snprintf(errbuf, errbuf_len - 1,
|
||||
"ftype mismatch. built-in: %d, yours: %d\n", ftype,
|
||||
tune->ftype);
|
||||
return false;
|
||||
} else if (n_threads != tune->n_threads) {
|
||||
snprintf(errbuf, errbuf_len - 1,
|
||||
"n_threads mismatch. run-time: %d, yours: %d\n", n_threads,
|
||||
tune->n_threads);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < tune->n_shapes; i++) {
|
||||
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
|
||||
|
||||
struct ggml_task_profile *builtin_profiles = NULL;
|
||||
int n_profiles = ggml_mulmat_get_task_profiles(
|
||||
NULL, shape->src0_type, shape->src1_type, &builtin_profiles);
|
||||
|
||||
if (n_profiles != shape->n_profiles) {
|
||||
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: profiles order is relevant, too strict.
|
||||
size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
|
||||
if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
|
||||
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
|
||||
|
||||
printf("=== built-in profiles:\n");
|
||||
ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
|
||||
n_profiles);
|
||||
|
||||
printf("=== incoming profiles:\n");
|
||||
ggml_mulmat_tune_write_profiles(stderr, shape->profiles,
|
||||
shape->n_profiles);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
|
||||
const char *model, int ftype, int n_threads) {
|
||||
char errbuf[128];
|
||||
bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads,
|
||||
errbuf, sizeof(errbuf));
|
||||
if (!ok) {
|
||||
fprintf(stderr, "[mulmat tune] error: %s. run bench again.\n", errbuf);
|
||||
}
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
|
||||
int rc = fscanf(fp, "%d", &tune->version);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tune->version != GGML_MULMAT_TUNE_VERSION) {
|
||||
fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype,
|
||||
&tune->n_shapes, &tune->n_threads);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
|
||||
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
|
||||
|
||||
rc = fscanf(fp, "%d %d %d %d %d %d", &shape->N, &shape->K,
|
||||
(int *)&shape->src0_type, (int *)&shape->src1_type,
|
||||
&shape->n_profiles, &shape->m_num);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
size_t item_size = sizeof(struct ggml_mulmat_tune_m) *
|
||||
(shape->n_profiles * shape->m_num);
|
||||
shape->items = malloc(item_size);
|
||||
if (shape->items == NULL) {
|
||||
fprintf(stderr, "[mulmat tune] failed to allocate memory\n");
|
||||
return false;
|
||||
}
|
||||
memset(shape->items, 0, item_size);
|
||||
}
|
||||
|
||||
{
|
||||
size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles;
|
||||
shape->profiles = malloc(sz);
|
||||
GGML_ASSERT(shape->profiles);
|
||||
memset(shape->profiles, 0, sz);
|
||||
}
|
||||
|
||||
for (int ip = 0; ip < shape->n_profiles; ip++) {
|
||||
struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
for (int j = 0; j < 3; j++) {
|
||||
struct ggml_task_stage *ts = &profile->stages[j];
|
||||
int backend;
|
||||
int parallel;
|
||||
int wait;
|
||||
rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
ts->backend = (enum ggml_task_backend)backend;
|
||||
ts->parallel = parallel ? true : false;
|
||||
ts->wait = wait ? true : false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i_m = 0; i_m < shape->m_num; i_m++) {
|
||||
int M;
|
||||
for (int ip = 0; ip < shape->n_profiles; ip++) {
|
||||
if (ip == 0) {
|
||||
rc = fscanf(fp, "%d", &M);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
struct ggml_mulmat_tune_m *item =
|
||||
&shape->items[ip * shape->m_num + i_m];
|
||||
item->M = M;
|
||||
rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
|
||||
&item->stages_time[1], &item->stages_time[2]);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
|
||||
FILE *fp) {
|
||||
int rc;
|
||||
rc = fprintf(fp, "%d %s %d %d %d\n\n", tune->version, tune->model,
|
||||
tune->ftype, tune->n_shapes, tune->n_threads);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
|
||||
if (i_shape > 0) {
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
|
||||
rc = fprintf(fp, "%d %d %d %d %d %d\n", shape->N, shape->K,
|
||||
shape->src0_type, shape->src1_type, shape->n_profiles,
|
||||
shape->m_num);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
rc = ggml_mulmat_tune_write_profiles(fp, shape->profiles,
|
||||
shape->n_profiles);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i_m = 0; i_m < shape->m_num; i_m++) {
|
||||
for (int ip = 0; ip < shape->n_profiles; ip++) {
|
||||
struct ggml_mulmat_tune_m *item =
|
||||
&shape->items[ip * shape->m_num + i_m];
|
||||
if (ip == 0) {
|
||||
rc = fprintf(fp, "%4d", item->M);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
for (int k = 0; k < 3; k++) {
|
||||
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
|
||||
rc = fprintf(fp, "%9d", item->stages_time[k]);
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
rc = fprintf(fp, " 0");
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
rc = fprintf(fp, "\n");
|
||||
if (rc <= 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
const struct ggml_mulmat_tune_shape *
|
||||
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
|
||||
const int K, enum ggml_type src0_type,
|
||||
enum ggml_type src1_type) {
|
||||
GGML_ASSERT(N > 0 && K > 0);
|
||||
|
||||
for (int i = 0; i < tune->n_shapes; i++) {
|
||||
const struct ggml_mulmat_tune_shape *s = &tune->shapes[i];
|
||||
if (s->src0_type != src0_type || s->src1_type != src1_type) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (s->N > 0 && s->K > 0) {
|
||||
if (s->N == N && s->K == K) {
|
||||
return s;
|
||||
}
|
||||
} else if (s->N > 0 && s->K == 0) {
|
||||
if (s->N == N) {
|
||||
return s;
|
||||
}
|
||||
} else if (s->N == 0 && s->K > 0) {
|
||||
if (s->K == K) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// This is the experimental reference implementation.
|
||||
// Requires both n_threads are same at bench time and runtime.
|
||||
void ggml_mulmat_tune_estimate_time(
|
||||
const struct ggml_mulmat_tune_shape *shape, int M,
|
||||
struct ggml_mulmat_tune_time *profile_time) {
|
||||
|
||||
GGML_ASSERT(shape);
|
||||
GGML_ASSERT(profile_time);
|
||||
|
||||
const int m_num = shape->m_num;
|
||||
const int min_m = shape->items[0].M;
|
||||
const int max_m = shape->items[m_num - 1].M;
|
||||
|
||||
for (int ip = 0; ip < shape->n_profiles; ip++) {
|
||||
struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
profile_time[ip].total_time = 0;
|
||||
profile_time[ip].profile = profile;
|
||||
|
||||
const int items_offset = ip * m_num;
|
||||
|
||||
struct ggml_mulmat_tune_m *p0 = NULL;
|
||||
struct ggml_mulmat_tune_m *p1 = NULL;
|
||||
if (M < min_m) {
|
||||
// first two.
|
||||
p0 = &shape->items[items_offset];
|
||||
p1 = &shape->items[items_offset + 1];
|
||||
} else if (M > max_m) {
|
||||
// last two
|
||||
p0 = &shape->items[items_offset + m_num - 2];
|
||||
p1 = &shape->items[items_offset + m_num - 1];
|
||||
} else {
|
||||
for (int i = 0; i < m_num; i++) {
|
||||
p1 = &shape->items[items_offset + i];
|
||||
if (p1->M == M) {
|
||||
p0 = p1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (i > 0) {
|
||||
p0 = (struct ggml_mulmat_tune_m *)(p1 - 1);
|
||||
if (M > p0->M && M < p1->M) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(p0 && p1);
|
||||
|
||||
for (int i_stage = 0; i_stage < 3; i_stage++) {
|
||||
struct ggml_task_stage *stage = &profile->stages[i_stage];
|
||||
if (stage->backend == GGML_TASK_BACKEND_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int p0_v = p0->stages_time[i_stage];
|
||||
int p1_v = p1->stages_time[i_stage];
|
||||
|
||||
GGML_ASSERT(p0_v >= 0);
|
||||
GGML_ASSERT(p1_v >= 0);
|
||||
|
||||
// t = aM + b
|
||||
double a;
|
||||
double b;
|
||||
|
||||
if (p0 == p1) {
|
||||
a = 0.0;
|
||||
b = p1_v;
|
||||
} else {
|
||||
a = 1.0 * (p1_v - p0_v) / (p1->M - p0->M);
|
||||
b = p1_v - a * p1->M;
|
||||
}
|
||||
int t = (int)(a * M + b);
|
||||
|
||||
profile_time[ip].stage_time[i_stage] = t;
|
||||
profile_time[ip].total_time += t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Experimental: create mul_mat tensor.
|
||||
static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K,
|
||||
enum ggml_type src0_type,
|
||||
struct ggml_context **ctx) {
|
||||
// At most 256, because in `ggml_quantize_qx_x`, the index type of hist is
|
||||
// either int8_t or uint8_t.
|
||||
// Use 1024 to avoid suddenly broken.
|
||||
int64_t hist[1024];
|
||||
|
||||
bool src0_is_quantized = ggml_is_quantized(src0_type);
|
||||
|
||||
size_t ctx_size = 0;
|
||||
ctx_size += (size_t)(M * N * ggml_type_sizef(GGML_TYPE_F32)); // src1
|
||||
ctx_size += (size_t)(N * K * ggml_type_sizef(src0_type)); // src0
|
||||
ctx_size += (size_t)(1024 * 1024 * 64); // experimental
|
||||
|
||||
if (src0_is_quantized) {
|
||||
// quantize F32 to Qx_x
|
||||
ctx_size += (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32));
|
||||
}
|
||||
|
||||
struct ggml_init_params init_params = {
|
||||
.mem_size = ctx_size,
|
||||
.mem_buffer = NULL,
|
||||
.no_alloc = 0,
|
||||
};
|
||||
|
||||
*ctx = ggml_init(init_params);
|
||||
GGML_ASSERT(*ctx);
|
||||
|
||||
// src0: N x K
|
||||
struct ggml_tensor *src0 =
|
||||
ggml_new_tensor_2d(*ctx, src0_type, (int64_t)K, (int64_t)N);
|
||||
|
||||
// src1: M x K
|
||||
struct ggml_tensor *src1 =
|
||||
ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)M);
|
||||
ggml_set_f32(src1, 0.5f);
|
||||
|
||||
if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) {
|
||||
ggml_set_f32(src0, 0.1f);
|
||||
} else if (src0_is_quantized) {
|
||||
struct ggml_tensor *src0_f32 =
|
||||
ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N);
|
||||
ggml_set_f32(src0_f32, 0.1f);
|
||||
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K,
|
||||
K, hist);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K,
|
||||
K, hist);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K,
|
||||
K, hist);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K,
|
||||
K, hist);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K,
|
||||
K, hist);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// node: M x N
|
||||
// Will compute z = y * xT, z: node, y: src1, x: src0
|
||||
return ggml_mul_mat(*ctx, src0, src1);
|
||||
}
|
||||
|
||||
// Experimental: allocate memory for wdata with max possible size.
|
||||
// This part of code is actually belongs to ggml compute graph.
|
||||
static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
|
||||
// The size is actually determined by cgraph before computing.
|
||||
// Apart from the src0_type, wsize is affected by backend, cache line size,
|
||||
// n_threads etc.
|
||||
|
||||
const size_t extra = 1024 * 1024;
|
||||
size_t sz = (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)) + extra;
|
||||
void *buf = malloc(sz);
|
||||
|
||||
if (!buf) {
|
||||
fprintf(stderr,
|
||||
"[mulmat tune] error: failed to allocate %zu MiB memory",
|
||||
sz / 1024 / 1024);
|
||||
return 0;
|
||||
}
|
||||
|
||||
memset(buf, 0, sz);
|
||||
*wdata = buf;
|
||||
return sz;
|
||||
}
|
||||
|
||||
int ggml_mulmat_tune_get_builtin_task_backends(
|
||||
enum ggml_task_backend *backends) {
|
||||
int i = 0;
|
||||
backends[i++] = GGML_TASK_BACKEND_CPU;
|
||||
|
||||
if (ggml_cpu_has_cpublas()) {
|
||||
backends[i++] = GGML_TASK_BACKEND_CPU_BLAS;
|
||||
}
|
||||
|
||||
if (ggml_cpu_has_cublas()) {
|
||||
backends[i++] = GGML_TASK_BACKEND_GPU_CUDA;
|
||||
} else if (ggml_cpu_has_clblast()) {
|
||||
backends[i++] = GGML_TASK_BACKEND_GPU_CL;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
|
||||
struct ggml_mulmat_tune_params *params) {
|
||||
GGML_ASSERT(tune);
|
||||
GGML_ASSERT(params);
|
||||
GGML_ASSERT(params->model.name);
|
||||
|
||||
enum ggml_task_backend backends[16];
|
||||
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
|
||||
if (n_backends < 2) {
|
||||
fprintf(stderr,
|
||||
"[mulmat tune] error: this program was not built with BLAS.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ok = ggml_mulmat_tune_init(tune, params, NULL);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
char buf[128] = {0};
|
||||
int offset = 0;
|
||||
|
||||
for (int i = 0; i < n_backends; i++) {
|
||||
if (i > 0) {
|
||||
buf[offset++] = ',';
|
||||
buf[offset++] = ' ';
|
||||
}
|
||||
const char *name = ggml_mulmat_tune_task_backend_name(backends[i]);
|
||||
size_t len = strlen(name);
|
||||
memcpy(&buf[offset], name, len);
|
||||
offset += (int)len;
|
||||
}
|
||||
|
||||
fprintf(stdout,
|
||||
"[mulmat tune] model: %s, ggml ftype: %d, "
|
||||
"n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n",
|
||||
params->model.name, params->model.ftype, params->n_pass,
|
||||
params->n_threads, tune->n_shapes, buf);
|
||||
}
|
||||
|
||||
int64_t stages_time[3];
|
||||
int64_t t0 = ggml_time_ms();
|
||||
|
||||
struct ggml_threading_context *thrd_ctx = ggml_threading_start(
|
||||
tune->n_threads, ggml_threading_graph_compute_thread,
|
||||
ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE,
|
||||
stages_time);
|
||||
|
||||
for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
|
||||
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
|
||||
int M;
|
||||
int N = shape->N;
|
||||
int K = shape->K;
|
||||
|
||||
char buf[20] = {0};
|
||||
int buf_len = sizeof(buf) - 1;
|
||||
int line_len = 0;
|
||||
|
||||
for (int i_m = 0; i_m < shape->m_num; i_m++) {
|
||||
M = shape->arr_m[i_m];
|
||||
if (shape->N == 0) {
|
||||
N = M;
|
||||
} else if (shape->K == 0) {
|
||||
K = M;
|
||||
}
|
||||
|
||||
if (params->progress) {
|
||||
line_len = snprintf(buf, buf_len, "%d %d %d ", N, K, M);
|
||||
fprintf(stdout, "%s", buf);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
char *wdata = NULL;
|
||||
size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata);
|
||||
if (wsize == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
struct ggml_context *ctx = NULL;
|
||||
struct ggml_tensor *node =
|
||||
ggml_mulmat_new_tensor(M, N, K, shape->src0_type, &ctx);
|
||||
|
||||
for (int ip = 0; ip < shape->n_profiles; ip++) {
|
||||
const struct ggml_task_profile *profile = &shape->profiles[ip];
|
||||
|
||||
memcpy(&node->task_profile, profile,
|
||||
sizeof(struct ggml_task_profile));
|
||||
|
||||
struct ggml_mulmat_tune_m *item =
|
||||
&shape->items[ip * shape->m_num + i_m];
|
||||
item->M = M;
|
||||
|
||||
int min[3] = {INT32_MAX, INT32_MAX, INT32_MAX};
|
||||
|
||||
for (int k = 0; k < params->n_pass; k++) {
|
||||
for (int j = 0; j < 3; j++) {
|
||||
stages_time[j] = 0;
|
||||
}
|
||||
|
||||
/*enum ggml_compute_error err = */
|
||||
ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
int v = (int)stages_time[i];
|
||||
if (v < min[i]) {
|
||||
min[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
if (params->progress) {
|
||||
fprintf(stdout, ".");
|
||||
fflush(stdout);
|
||||
line_len++;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 3; i++) {
|
||||
item->stages_time[i] = min[i];
|
||||
}
|
||||
}
|
||||
|
||||
ggml_free(ctx);
|
||||
free(wdata);
|
||||
|
||||
if (params->progress) {
|
||||
line_len += 10;
|
||||
for (int j = 0; j < line_len; j++) {
|
||||
fprintf(stdout, "\b \b");
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_threading_stop(thrd_ctx);
|
||||
|
||||
fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n",
|
||||
(int)(ggml_time_ms() - t0) / 1000);
|
||||
|
||||
// output
|
||||
|
||||
if (params->fname && strcmp(params->fname, "") != 0) {
|
||||
FILE *fp = fopen(params->fname, "w");
|
||||
if (!fp) {
|
||||
fprintf(stderr,
|
||||
"[mulmat tune] warn: failed to open file `%s`, print to "
|
||||
"console instead\n\n",
|
||||
params->fname);
|
||||
params->output_console = 1;
|
||||
} else {
|
||||
ok = ggml_mulmat_tune_write_data(tune, fp);
|
||||
fclose(fp);
|
||||
|
||||
if (ok) {
|
||||
fprintf(stdout, "[mulmat tune] data was written to `%s`\n",
|
||||
params->fname);
|
||||
} else {
|
||||
fprintf(
|
||||
stderr,
|
||||
"[mulmat tune] warn: failed to write file `%s`, print to "
|
||||
"console instead\n\n",
|
||||
params->fname);
|
||||
params->output_console = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (params->output_console) {
|
||||
return ggml_mulmat_tune_write_data(tune, stdout);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
137
ggml-tune.h
Normal file
137
ggml-tune.h
Normal file
@ -0,0 +1,137 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_MULMAT_TUNE_VERSION 8
|
||||
#define GGML_MULMAT_N_SHAPES 6
|
||||
|
||||
#define GGML_MULMAT_MAX_PASS 3
|
||||
|
||||
struct ggml_mulmat_tune_m {
|
||||
int M;
|
||||
|
||||
int stages_time[3];
|
||||
};
|
||||
|
||||
struct ggml_mulmat_tune_model {
|
||||
const char *name;
|
||||
|
||||
enum ggml_ftype ftype;
|
||||
|
||||
int n_vocab;
|
||||
|
||||
int n_embd;
|
||||
|
||||
// n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult
|
||||
int n_ff;
|
||||
|
||||
// n_rot = n_embd/n_head;
|
||||
int n_rot;
|
||||
};
|
||||
|
||||
struct ggml_mulmat_tune_shape {
|
||||
// For RoPE, one of N / K is 0.
|
||||
int N;
|
||||
int K;
|
||||
|
||||
enum ggml_type src0_type;
|
||||
enum ggml_type src1_type;
|
||||
|
||||
int n_profiles;
|
||||
struct ggml_task_profile *profiles;
|
||||
|
||||
int m_num;
|
||||
int *arr_m;
|
||||
|
||||
struct ggml_mulmat_tune_m *items;
|
||||
};
|
||||
|
||||
struct ggml_mulmat_tune {
|
||||
int version;
|
||||
|
||||
char model[16];
|
||||
|
||||
enum ggml_ftype ftype;
|
||||
|
||||
int n_shapes;
|
||||
// Given N/K, we bench for mul_mat [M,K] x [K,N].
|
||||
struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES];
|
||||
|
||||
int n_threads;
|
||||
};
|
||||
|
||||
struct ggml_mulmat_tune_time {
|
||||
struct ggml_task_profile *profile;
|
||||
int stage_time[3];
|
||||
int total_time;
|
||||
};
|
||||
|
||||
struct mm_cache_element {
|
||||
int M;
|
||||
int N;
|
||||
int K;
|
||||
struct ggml_task_profile *profile;
|
||||
int stages_time[3];
|
||||
};
|
||||
|
||||
// params for tune/bench.
|
||||
struct ggml_mulmat_tune_params {
|
||||
struct ggml_mulmat_tune_model model;
|
||||
int m_num;
|
||||
int n_pass;
|
||||
int n_threads;
|
||||
bool progress; // print and clear '.'
|
||||
bool output_console; // also print result to console
|
||||
const char *fname;
|
||||
};
|
||||
|
||||
// NOTE: stages_time is filled if not null.
|
||||
const struct ggml_task_profile *
|
||||
ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
|
||||
int N, int K, enum ggml_type src0_t,
|
||||
enum ggml_type src1_t, int stages_time[3]);
|
||||
|
||||
bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
|
||||
const char *model_name, int ftype,
|
||||
int n_threads);
|
||||
|
||||
void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
|
||||
const char *name, enum ggml_ftype ftype);
|
||||
|
||||
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
|
||||
struct ggml_mulmat_tune_params *params,
|
||||
struct ggml_task_profile_factory *profile_factory);
|
||||
|
||||
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);
|
||||
|
||||
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);
|
||||
|
||||
bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);
|
||||
|
||||
const struct ggml_mulmat_tune_shape *
|
||||
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K,
|
||||
enum ggml_type src0_type, enum ggml_type src1_type);
|
||||
|
||||
void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape,
|
||||
int M,
|
||||
struct ggml_mulmat_tune_time *profile_time);
|
||||
|
||||
const char *ggml_task_backend_name(enum ggml_task_backend backend);
|
||||
|
||||
int ggml_mulmat_tune_get_builtin_task_backends(
|
||||
enum ggml_task_backend *backends);
|
||||
|
||||
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
|
||||
struct ggml_mulmat_tune_params *params);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
71
ggml.h
71
ggml.h
@ -1,5 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
|
||||
|
||||
//
|
||||
// GGML Tensor Library
|
||||
//
|
||||
@ -200,6 +202,7 @@
|
||||
#define GGML_MAX_OPT 4
|
||||
#define GGML_MAX_NAME 32
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
#define GGML_MAX_TASK_PROFILES 8
|
||||
|
||||
#define GGML_ASSERT(x) \
|
||||
do { \
|
||||
@ -347,7 +350,6 @@ extern "C" {
|
||||
GGML_OP_COUNT,
|
||||
};
|
||||
|
||||
|
||||
// ggml object
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
@ -360,6 +362,54 @@ extern "C" {
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
// As part of task config profile solution, `ggml_task_backend` defines
|
||||
// backends for each task stage. Similar to `ggml_tensor.backend`,
|
||||
// `ggml_tensor.task_profile` generalizes how to configure tensor computing
|
||||
// at per task-stage level.
|
||||
//
|
||||
// The following enum values are designed as combination of hardware and
|
||||
// optional software interface.
|
||||
enum ggml_task_backend {
|
||||
GGML_TASK_BACKEND_NONE = 0,
|
||||
|
||||
// [0x10, 0x1F]: CPU
|
||||
GGML_TASK_BACKEND_CPU = 0x10,
|
||||
GGML_TASK_BACKEND_CPU_BLAS = 0x11,
|
||||
|
||||
// [0x20 - 0x2F]: GPU
|
||||
GGML_TASK_BACKEND_GPU = 0x20,
|
||||
GGML_TASK_BACKEND_GPU_CUDA = 0x21,
|
||||
GGML_TASK_BACKEND_GPU_CL = 0x22,
|
||||
};
|
||||
|
||||
// config for computing one of the 3 task stages of a tensor.
|
||||
struct ggml_task_stage {
|
||||
enum ggml_task_backend backend;
|
||||
bool parallel;
|
||||
// hint idle workers go waiting, valid only when parallel is false.
|
||||
bool wait;
|
||||
};
|
||||
|
||||
// config for computing a tensor.
|
||||
struct ggml_task_profile {
|
||||
// index 0: INIT, 1: COMPUTE, 2: FINALIZE
|
||||
struct ggml_task_stage stages[3];
|
||||
|
||||
// MUST be used only in testing codes.
|
||||
uint8_t dev_flags[4];
|
||||
};
|
||||
|
||||
struct ggml_task_profile_factory {
|
||||
struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES];
|
||||
int n_f32_f32;
|
||||
|
||||
struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES];
|
||||
int n_f16_f32;
|
||||
|
||||
struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES];
|
||||
int n_qxx_f32;
|
||||
};
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
@ -383,7 +433,8 @@ extern "C" {
|
||||
struct ggml_tensor * opt[GGML_MAX_OPT];
|
||||
|
||||
// thread scheduling
|
||||
int n_tasks;
|
||||
|
||||
struct ggml_task_profile task_profile;
|
||||
|
||||
// performance
|
||||
int perf_runs;
|
||||
@ -396,7 +447,7 @@ extern "C" {
|
||||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
char padding[4];
|
||||
char padding[12];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
@ -407,6 +458,8 @@ extern "C" {
|
||||
int n_leafs;
|
||||
int n_threads;
|
||||
|
||||
struct ggml_mulmat_tune *tune;
|
||||
|
||||
size_t work_size;
|
||||
struct ggml_tensor * work;
|
||||
|
||||
@ -1287,9 +1340,21 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_cublas (void);
|
||||
GGML_API int ggml_cpu_has_clblast (void);
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_cpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
//
|
||||
// mulmat task profiles
|
||||
//
|
||||
GGML_API void ggml_mulmat_init_task_profiles(void);
|
||||
|
||||
GGML_API int ggml_mulmat_get_task_profiles(
|
||||
struct ggml_task_profile_factory *pf,
|
||||
enum ggml_type src0_t,
|
||||
enum ggml_type src1_t,
|
||||
struct ggml_task_profile **profiles);
|
||||
|
||||
//
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
//
|
||||
|
160
llama.cpp
160
llama.cpp
@ -4,6 +4,7 @@
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#endif
|
||||
|
||||
#include "llama-util.h"
|
||||
@ -20,6 +21,10 @@
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
#include "ggml-tune.h"
|
||||
#endif
|
||||
|
||||
#include <array>
|
||||
#include <ctime>
|
||||
#include <cinttypes>
|
||||
@ -280,6 +285,10 @@ struct llama_context {
|
||||
int buf_last = 0;
|
||||
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
||||
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
struct ggml_mulmat_tune *tune = nullptr;
|
||||
#endif
|
||||
|
||||
void use_buf(struct ggml_context * ctx, int i) {
|
||||
#if defined(LLAMA_USE_SCRATCH)
|
||||
size_t last_size = 0;
|
||||
@ -1396,10 +1405,12 @@ static bool llama_eval_internal(
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||
ggml_cgraph gf = {};
|
||||
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
gf.tune =lctx.tune;
|
||||
#endif
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
ggml_set_name(embd, "embd");
|
||||
@ -2732,7 +2743,150 @@ struct llama_context * llama_init_from_file(
|
||||
return ctx;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) {
|
||||
printf("\n");
|
||||
if (ctx->model.n_gpu_layers != 0) {
|
||||
fprintf(stderr, "[mulmat tune] error: is disabled by GPU offloading\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
const char *model_name = llama_model_type_name(ctx->model.type);
|
||||
|
||||
llama_hparams *hparams = &ctx->model.hparams;
|
||||
|
||||
enum ggml_ftype ggml_ftype;
|
||||
switch (hparams->ftype) {
|
||||
case LLAMA_FTYPE_ALL_F32:
|
||||
ggml_ftype = GGML_FTYPE_ALL_F32;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_F16:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_F16;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_0;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_1;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q8_0;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q2_K;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q3_K;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q4_K;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q5_K;
|
||||
break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
||||
ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
format("invalid output file type %d\n", hparams->ftype));
|
||||
}
|
||||
|
||||
int n_vocab = hparams->n_vocab;
|
||||
int n_embd = hparams->n_embd;
|
||||
int n_rot = hparams->n_rot;
|
||||
|
||||
int n_mult = hparams->n_mult;
|
||||
int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
|
||||
|
||||
struct ggml_mulmat_tune_params params = {
|
||||
/*.model =*/ {
|
||||
/* .name =*/ model_name,
|
||||
/* .ftype =*/ ggml_ftype,
|
||||
/* .n_vocab =*/ n_vocab,
|
||||
/* .n_embd =*/ n_embd,
|
||||
/* .n_ff =*/ n_ff,
|
||||
/* .n_rot =*/ n_rot,
|
||||
},
|
||||
/* .m_num =*/ 8,
|
||||
/* .n_pass =*/ 1,
|
||||
/* .n_threads =*/ n_threads,
|
||||
/* .prrogress =*/ true,
|
||||
/* .output_console =*/ false,
|
||||
/* .fname =*/ fname,
|
||||
};
|
||||
|
||||
bool empty_fname = !fname || strcmp(fname, "") == 0;
|
||||
|
||||
ctx->tune = new(struct ggml_mulmat_tune);
|
||||
if (!ctx->tune) {
|
||||
throw std::runtime_error(format("failed to allocate memory for tune\n"));
|
||||
}
|
||||
|
||||
if (tune) {
|
||||
bool ok = ggml_mulmat_tune_bench(ctx->tune, ¶ms);
|
||||
if (!ok) {
|
||||
ggml_mulmat_tune_free(ctx->tune);
|
||||
return false;
|
||||
}
|
||||
if (!empty_fname) {
|
||||
ggml_mulmat_tune_free(ctx->tune);
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (empty_fname) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!empty_fname) {
|
||||
FILE *fp = fopen(fname, "r");
|
||||
if (!fp) {
|
||||
fprintf(stderr, "[mulmat tune] failed to open file %s.\n",
|
||||
fname);
|
||||
} else {
|
||||
bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp);
|
||||
fclose(fp);
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr,
|
||||
"[mulmat tune] failed to read data from %s\n",
|
||||
fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname);
|
||||
|
||||
ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
void llama_free(struct llama_context * ctx) {
|
||||
#ifdef GGML_USE_MULMAT_TUNE
|
||||
if (ctx->tune) {
|
||||
delete(ctx->tune);
|
||||
}
|
||||
#endif
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
|
3
llama.h
3
llama.h
@ -300,6 +300,9 @@ extern "C" {
|
||||
// Print system information
|
||||
LLAMA_API const char * llama_print_system_info(void);
|
||||
|
||||
// Experimental utility functionality for mulmat tunning.
|
||||
LLAMA_API bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
2
tests/.gitignore
vendored
Normal file
2
tests/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
/test-ggml-threading
|
||||
/test-ggml-tune
|
@ -12,3 +12,5 @@ llama_add_test(test-sampling.cpp)
|
||||
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
|
||||
# llama_add_test(test-grad0.c) # SLOW
|
||||
# llama_add_test(test-opt.c) # SLOW
|
||||
llama_add_test(test-ggml-threading.c)
|
||||
llama_add_test(test-ggml-tune.c)
|
||||
|
345
tests/test-ggml-threading.c
Normal file
345
tests/test-ggml-threading.c
Normal file
@ -0,0 +1,345 @@
|
||||
#include "ggml-threading.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
// Purposes:
|
||||
// 1. general overview of the threading behaviors.
|
||||
// 2. race (dead lock) detection.
|
||||
|
||||
// # build
|
||||
// cd build
|
||||
//
|
||||
// # build release:
|
||||
// cmake .. && cmake --build . --config Release
|
||||
//
|
||||
// # build with sanitize:
|
||||
// cmake .. -DLLAMA_SANITIZE_THREAD=ON && cmake --build . --config Release
|
||||
//
|
||||
// # run:
|
||||
// ./bin/test-ggml-threading
|
||||
|
||||
// How to turn off the warning on Apple: malloc: nano zone abandoned due to
|
||||
// inability to reserve vm space?
|
||||
// ==> export MallocNanoZone=0, no need to rebuild.
|
||||
// See `nano_init()` from
|
||||
// https://opensource.apple.com/source/libmalloc/libmalloc-140.40.1/src/nano_malloc.c.auto.html
|
||||
|
||||
// How to view the threading debug:
|
||||
// ==> uncomment `#define GGML_THREADING_DEBUG 1` from file ggml-threading.c
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
|
||||
#define MAX_N_THREADS 16
|
||||
|
||||
static const int n_repeat = 10;
|
||||
|
||||
// It's frustrating to use atomic with c11 on Windows, let's replace atomic
|
||||
// counter with array.
|
||||
static int work_done_arr[MAX_N_THREADS];
|
||||
|
||||
static enum ggml_compute_error
|
||||
mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) {
|
||||
int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
|
||||
if (node->task_profile.stages[params->type].parallel) {
|
||||
loops /= params->nth;
|
||||
}
|
||||
|
||||
volatile int64_t j = 0;
|
||||
for (int i = 0; i < loops; i++) {
|
||||
j++;
|
||||
}
|
||||
|
||||
UNUSED(j);
|
||||
|
||||
work_done_arr[params->ith]++;
|
||||
return GGML_COMPUTE_OK;
|
||||
}
|
||||
|
||||
int test_driver(int id, struct ggml_tensor *node, int n_threads) {
|
||||
printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads);
|
||||
|
||||
for (int i = 0; i < n_threads; i++) {
|
||||
work_done_arr[i] = 0;
|
||||
}
|
||||
|
||||
bool wait_on_done = (node->task_profile.dev_flags[0] > 0u);
|
||||
|
||||
enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF;
|
||||
if (wait_on_done) {
|
||||
features |= GGML_THREADING_FEATURE_WAIT_ON_DONE;
|
||||
}
|
||||
|
||||
int t0 = (int)ggml_time_us();
|
||||
|
||||
struct ggml_threading_context *ctx =
|
||||
ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
|
||||
mock_task_runner, features, /*stages_time*/ NULL);
|
||||
|
||||
int t1 = (int)ggml_time_us();
|
||||
|
||||
for (int i = 0; i < n_repeat; i++) {
|
||||
enum ggml_compute_error err = ggml_threading_compute_tensor(
|
||||
ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
|
||||
if (err != GGML_COMPUTE_OK) {
|
||||
ggml_threading_stop(ctx);
|
||||
fprintf(stderr,
|
||||
"ggml_threading_compute_tensor failed with error: %d.\n",
|
||||
err);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int t2 = (int)ggml_time_us();
|
||||
|
||||
ggml_threading_stop(ctx);
|
||||
|
||||
int t3 = (int)ggml_time_us();
|
||||
|
||||
int expect = 0;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
struct ggml_task_stage *ts = &node->task_profile.stages[i];
|
||||
if (ts->backend != GGML_TASK_BACKEND_NONE) {
|
||||
if (ts->parallel) {
|
||||
expect += n_threads;
|
||||
} else {
|
||||
expect++;
|
||||
}
|
||||
}
|
||||
}
|
||||
expect *= n_repeat;
|
||||
|
||||
int actual = 0;
|
||||
for (int i = 0; i < n_threads; i++) {
|
||||
actual += work_done_arr[i];
|
||||
}
|
||||
|
||||
uint8_t loops = node->task_profile.dev_flags[1];
|
||||
|
||||
printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: "
|
||||
"(parallel: %d, "
|
||||
"wait: %d)\n"
|
||||
"\tstage-1: (parallel: %d, wait: %d)\n",
|
||||
loops, wait_on_done, node->task_profile.stages[0].parallel,
|
||||
node->task_profile.stages[0].wait,
|
||||
node->task_profile.stages[1].parallel,
|
||||
node->task_profile.stages[1].wait);
|
||||
|
||||
if (actual == expect) {
|
||||
printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f "
|
||||
"ms, total %6.3f ms\n",
|
||||
1.0 * (t1 - t0) / 1000, 1.0 * (t2 - t1) / 1000,
|
||||
1.0 * (t3 - t2) / 1000, 1.0 * (t3 - t0) / 1000);
|
||||
return 0;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect,
|
||||
actual);
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
static enum ggml_compute_error
|
||||
mock_task_runner_fallback(struct ggml_compute_params *params,
|
||||
struct ggml_tensor *node) {
|
||||
UNUSED(params);
|
||||
if (node->backend == GGML_BACKEND_GPU) {
|
||||
// ... finally failed to compute in GPU.
|
||||
|
||||
node->backend = GGML_BACKEND_CPU;
|
||||
return GGML_COMPUTE_FALLBACK;
|
||||
} else {
|
||||
return GGML_COMPUTE_OK;
|
||||
}
|
||||
}
|
||||
|
||||
// By design, fallback should happen when attempt computing tensor in GPU,
|
||||
// thus it is not parallelled.
|
||||
int test_fallback(struct ggml_tensor *node) {
|
||||
struct ggml_threading_context *ctx = ggml_threading_start(
|
||||
1, ggml_threading_graph_compute_thread, mock_task_runner_fallback,
|
||||
/*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL);
|
||||
|
||||
enum ggml_compute_error err =
|
||||
ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
|
||||
if (err == GGML_COMPUTE_FALLBACK) {
|
||||
err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL,
|
||||
/*wsize*/ 0);
|
||||
}
|
||||
|
||||
ggml_threading_stop(ctx);
|
||||
if (err != GGML_COMPUTE_OK) {
|
||||
fprintf(stderr,
|
||||
"ggml_threading_compute_tensor failed with error: %d.\n", err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
ggml_time_init();
|
||||
|
||||
struct ggml_tensor node;
|
||||
memset(&node, 0, sizeof(struct ggml_tensor));
|
||||
|
||||
struct ggml_task_stage *stages = node.task_profile.stages;
|
||||
|
||||
stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
stages[2].backend = GGML_TASK_BACKEND_NONE;
|
||||
|
||||
int n_passed = 0;
|
||||
int n_tests = 0;
|
||||
|
||||
int parallel[3] = {0, 1, 2};
|
||||
|
||||
// In github build actions (windows-latest-cmake and ubuntu-latest-cmake):
|
||||
// When n_threads >= 4, the thread init time and compute time suddenly goes
|
||||
// down to 100x ~ 1000x slow -- comparing to n_threads == 2.
|
||||
//
|
||||
// But the tests (n_threads 1, 2, 4, 6) looks sound on my devices:
|
||||
// - MacBook air 2013, ubuntu 22.04
|
||||
// - MacBook pro 2018, macOS 13.4
|
||||
//
|
||||
// So I assume the github build host has limited multi-cpu quota.
|
||||
// Will skip computing when threading init time is too slow.
|
||||
//
|
||||
// NOTE: it's observed that when workload is 0 and n_threads >= number of
|
||||
// physical cores:
|
||||
// - the wait/wakeup time varies much: can be up to tens or hundreds of the
|
||||
// average time, thus greatly punishes those small workloads.
|
||||
// - wait_on_done is general faster than wait_now, can be 10x faster.
|
||||
|
||||
int threads_arr[] = {1, 2, 4, 8};
|
||||
int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
|
||||
|
||||
// millions of loops.
|
||||
uint8_t workload_arr[] = {0u, 1u, 10u};
|
||||
int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]);
|
||||
|
||||
// node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops.
|
||||
|
||||
for (int x = 0; x < workload_arr_len; x++) {
|
||||
node.task_profile.dev_flags[1] = workload_arr[x];
|
||||
|
||||
for (int i = 0; i < threads_arr_len; i++) {
|
||||
int n_threads = threads_arr[i];
|
||||
if (n_threads > MAX_N_THREADS) {
|
||||
abort();
|
||||
}
|
||||
|
||||
printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, "
|
||||
"loops: %2d million(s) ====\n",
|
||||
n_repeat, n_threads, workload_arr[x]);
|
||||
|
||||
if (n_threads > 1) { // skip this n_threads when too slow.
|
||||
int t0 = (int)ggml_time_us();
|
||||
|
||||
struct ggml_threading_context *ctx = ggml_threading_start(
|
||||
n_threads, ggml_threading_graph_compute_thread,
|
||||
mock_task_runner, 0, /*stages_time*/ NULL);
|
||||
|
||||
int t1 = (int)ggml_time_us();
|
||||
|
||||
ggml_threading_stop(ctx);
|
||||
|
||||
int elapsed_us = t1 - t0;
|
||||
if (elapsed_us > 500 * n_threads) {
|
||||
fprintf(stderr,
|
||||
"[test-ggml-threading] warning: it took took %.3f "
|
||||
"ms to start %d worker thread(s).\n",
|
||||
1.0 * elapsed_us / 1000, n_threads - 1);
|
||||
fprintf(stderr, "[test-ggml-threading] warning: looks like "
|
||||
"the environment is too slow to run this "
|
||||
"number of threads, skip.\n");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// multi-threads: parallel + wait_now/wait_on_done
|
||||
|
||||
if (n_threads == 1) {
|
||||
stages[0].parallel = false;
|
||||
stages[1].parallel = false;
|
||||
stages[0].wait = false;
|
||||
stages[1].wait = false;
|
||||
|
||||
n_tests++;
|
||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||
n_passed++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int j = 0; j < 3; j++) {
|
||||
stages[0].wait = false;
|
||||
stages[1].wait = false;
|
||||
node.task_profile.dev_flags[0] = 0u;
|
||||
|
||||
if (parallel[j] == 0) {
|
||||
stages[0].parallel = false;
|
||||
stages[1].parallel = false;
|
||||
|
||||
n_tests++;
|
||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||
n_passed++;
|
||||
}
|
||||
} else if (parallel[j] == 1) {
|
||||
stages[0].parallel = true;
|
||||
stages[1].parallel = false;
|
||||
|
||||
for (int k = 0; k < 2; k++) {
|
||||
stages[1].wait = (k == 1);
|
||||
|
||||
if (!stages[1].wait) {
|
||||
n_tests++;
|
||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||
n_passed++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// wait
|
||||
|
||||
for (int m = 0; m < 2; m++) {
|
||||
if (m == 1) {
|
||||
node.task_profile.dev_flags[0] = 1u;
|
||||
}
|
||||
n_tests++;
|
||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||
n_passed++;
|
||||
}
|
||||
node.task_profile.dev_flags[0] = 0u;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
stages[0].parallel = true;
|
||||
stages[1].parallel = true;
|
||||
|
||||
n_tests++;
|
||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||
n_passed++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
++n_tests;
|
||||
|
||||
node.backend = GGML_BACKEND_GPU;
|
||||
if (test_fallback(&node) == 0) {
|
||||
++n_passed;
|
||||
printf("\n[test-ggml-threading] test fallback: ok\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests);
|
||||
|
||||
return (n_passed == n_tests) ? 0 : 1;
|
||||
}
|
200
tests/test-ggml-tune.c
Normal file
200
tests/test-ggml-tune.c
Normal file
@ -0,0 +1,200 @@
|
||||
#include "ggml-tune.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
static int bench(void);
|
||||
static int estimate_time_non_zero_NK(void);
|
||||
|
||||
static void init_params(struct ggml_mulmat_tune_params *params, int m_num) {
|
||||
*params = (struct ggml_mulmat_tune_params){
|
||||
.model =
|
||||
(struct ggml_mulmat_tune_model){
|
||||
.name = "3B", // fake
|
||||
.ftype = GGML_FTYPE_MOSTLY_Q4_0,
|
||||
.n_vocab = 4096,
|
||||
.n_embd = 1024,
|
||||
.n_ff = 2048,
|
||||
.n_rot = 128,
|
||||
},
|
||||
.m_num = m_num,
|
||||
.n_pass = 1,
|
||||
.n_threads = 1,
|
||||
.progress = false,
|
||||
.output_console = true,
|
||||
.fname = NULL};
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
int rv = bench();
|
||||
if (rv != 0) {
|
||||
return rv;
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
rv = estimate_time_non_zero_NK();
|
||||
if (rv != 0) {
|
||||
return rv;
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bench(void) {
|
||||
printf("test: %s\n", __func__);
|
||||
|
||||
{
|
||||
enum ggml_task_backend backends[16];
|
||||
int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
|
||||
if (n_backends < 2) {
|
||||
printf("test: %s, skipped because no BLAS\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
struct ggml_init_params init_params = {
|
||||
/*.mem_size =*/1,
|
||||
/*.mem_buffer =*/NULL,
|
||||
/*.no_alloc =*/0,
|
||||
};
|
||||
struct ggml_context *ctx = ggml_init(init_params);
|
||||
GGML_ASSERT(ctx);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
struct ggml_mulmat_tune tune;
|
||||
|
||||
struct ggml_mulmat_tune_params params;
|
||||
|
||||
init_params(¶ms, /*m_num*/ 4);
|
||||
|
||||
bool ok = ggml_mulmat_tune_bench(&tune, ¶ms);
|
||||
ggml_mulmat_tune_free(&tune);
|
||||
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
int estimate_time_non_zero_NK(void) {
|
||||
printf("test: %s\n", __func__);
|
||||
|
||||
struct test_data_t {
|
||||
int M;
|
||||
int time[3]; // 3 profiles.
|
||||
};
|
||||
|
||||
struct ggml_mulmat_tune tune = {
|
||||
.version = 1,
|
||||
.ftype = GGML_FTYPE_MOSTLY_Q4_0,
|
||||
};
|
||||
|
||||
const int m_num = 2;
|
||||
|
||||
struct ggml_task_profile_factory pf;
|
||||
memset(&pf, 0, sizeof(struct ggml_task_profile_factory));
|
||||
|
||||
{
|
||||
pf.n_qxx_f32 = 2;
|
||||
pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU;
|
||||
|
||||
pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU;
|
||||
pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS;
|
||||
}
|
||||
|
||||
struct ggml_mulmat_tune_params params;
|
||||
init_params(¶ms, m_num);
|
||||
|
||||
ggml_mulmat_tune_init(&tune, ¶ms, &pf);
|
||||
|
||||
struct ggml_mulmat_tune_shape *shape = NULL;
|
||||
for (int i = 0; i < tune.n_shapes; i++) {
|
||||
if (tune.shapes[i].N > 0 && tune.shapes[i].K > 0) {
|
||||
shape = &tune.shapes[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(shape);
|
||||
GGML_ASSERT(shape->n_profiles == 2);
|
||||
GGML_ASSERT(ggml_is_quantized(shape->src0_type));
|
||||
|
||||
printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K,
|
||||
shape->n_profiles);
|
||||
|
||||
{
|
||||
shape->items[0] =
|
||||
(struct ggml_mulmat_tune_m){.M = 2, .stages_time = {2, 4, 0}};
|
||||
shape->items[1] =
|
||||
(struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 8, 0}};
|
||||
|
||||
shape->items[2] =
|
||||
(struct ggml_mulmat_tune_m){.M = 2, .stages_time = {4, 4, 0}};
|
||||
shape->items[3] =
|
||||
(struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 4, 0}};
|
||||
}
|
||||
|
||||
const struct test_data_t test_data[] = {
|
||||
{
|
||||
.M = 1, // out of range
|
||||
.time = {3, 8},
|
||||
},
|
||||
{
|
||||
.M = 2,
|
||||
.time = {6, 8},
|
||||
},
|
||||
{
|
||||
.M = 3,
|
||||
.time = {9, 8},
|
||||
},
|
||||
{
|
||||
.M = 4,
|
||||
.time = {12, 8},
|
||||
},
|
||||
{
|
||||
.M = 5, // out of range
|
||||
.time = {15, 8},
|
||||
},
|
||||
};
|
||||
|
||||
int n_tests = (int)(sizeof(test_data) / sizeof(struct test_data_t));
|
||||
|
||||
struct ggml_mulmat_tune_time profile_time[GGML_MAX_TASK_PROFILES];
|
||||
size_t profile_time_sz =
|
||||
sizeof(struct ggml_mulmat_tune_time) * GGML_MAX_TASK_PROFILES;
|
||||
|
||||
int n_passed = 0;
|
||||
for (int i = 0; i < n_tests; i++) {
|
||||
memset(profile_time, 0, profile_time_sz);
|
||||
const struct test_data_t *e = &test_data[i];
|
||||
|
||||
const struct ggml_mulmat_tune_shape *matched_shape =
|
||||
ggml_mulmat_tune_get_shape(&tune, shape->N, shape->K,
|
||||
shape->src0_type, shape->src1_type);
|
||||
GGML_ASSERT(matched_shape);
|
||||
GGML_ASSERT(matched_shape == shape);
|
||||
|
||||
ggml_mulmat_tune_estimate_time(matched_shape, e->M, profile_time);
|
||||
|
||||
for (int j = 0; j < shape->n_profiles; j++) {
|
||||
int actual = profile_time[j].total_time;
|
||||
int expect = e->time[j];
|
||||
if (expect != actual) {
|
||||
fprintf(stderr,
|
||||
"test fail. i: %d, j: %d, M: %d, expect: "
|
||||
"%d, actual: %d\n",
|
||||
i, j, e->M, expect, actual);
|
||||
} else {
|
||||
++n_passed;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
n_tests *= shape->n_profiles;
|
||||
printf("%2d of %2d pass\n", n_passed, n_tests);
|
||||
|
||||
ggml_mulmat_tune_free(&tune);
|
||||
|
||||
return n_passed == n_tests ? 0 : 1;
|
||||
}
|
Loading…
Reference in New Issue
Block a user