Merge branch 'master' into custom-attention-mask

This commit is contained in:
Georgi Gerganov 2023-09-18 11:15:18 +03:00
commit 58bb5110ca
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
44 changed files with 1180 additions and 413 deletions

View File

@ -265,22 +265,24 @@ jobs:
matrix: matrix:
include: include:
- build: 'noavx' - build: 'noavx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF' defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2' - build: 'avx2'
defines: '-DLLAMA_BUILD_SERVER=ON' defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx' - build: 'avx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF' defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512' - build: 'avx512'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast' - build: 'clblast'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas' - build: 'openblas'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
steps: steps:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v3 uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Download OpenCL SDK - name: Download OpenCL SDK
id: get_opencl id: get_opencl
@ -397,11 +399,14 @@ jobs:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v3 uses: actions/checkout@v3
with:
fetch-depth: 0
- uses: Jimver/cuda-toolkit@v0.2.11 - uses: Jimver/cuda-toolkit@v0.2.11
id: cuda-toolkit id: cuda-toolkit
with: with:
cuda: ${{ matrix.cuda }} cuda: ${{ matrix.cuda }}
method: 'network'
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Build - name: Build
@ -409,7 +414,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release cmake --build . --config Release
- name: Determine tag name - name: Determine tag name
@ -485,6 +490,8 @@ jobs:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v3 uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Determine tag name - name: Determine tag name
id: tag id: tag

View File

@ -80,6 +80,8 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
@ -304,6 +306,7 @@ if (LLAMA_CUBLAS)
add_compile_definitions(GGML_CUDA_F16) add_compile_definitions(GGML_CUDA_F16)
endif() endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
if (LLAMA_STATIC) if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@ -427,6 +430,7 @@ if (LLAMA_ALL_WARNINGS)
-Wextra -Wextra
-Wpedantic -Wpedantic
-Wcast-qual -Wcast-qual
-Wmissing-declarations
-Wno-unused-function -Wno-unused-function
-Wno-multichar -Wno-multichar
) )

View File

@ -95,16 +95,19 @@ CXXV := $(shell $(CXX) --version | head -n 1)
# #
# keep standard at C11 and C++11 # keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
# -Ofast tends to produce faster code, but may not be available for some compilers. # -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST ifdef LLAMA_FAST
OPT = -Ofast MK_CFLAGS += -Ofast
MK_HOST_CXXFLAGS += -Ofast
MK_CUDA_CXXFLAGS += -O3
else else
OPT = -O3 MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
endif endif
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = $(OPT) -std=c11 -fPIC
MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
MK_LDFLAGS =
# clock_gettime came in POSIX.1b (1993) # clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@ -172,9 +175,16 @@ endif # LLAMA_DISABLE_LOGS
# warnings # warnings
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
ifeq '' '$(findstring clang,$(shell $(CXX) --version))' # TODO(cebtenzzre): remove this once PR #2632 gets merged
TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
# clang++ only
MK_CXXFLAGS += -Wmissing-prototypes
TTFS_CXXFLAGS += -Wno-missing-prototypes
else
# g++ only # g++ only
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
endif endif
@ -225,7 +235,7 @@ ifndef RISCV
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available: # Use all CPU extensions that are available:
MK_CFLAGS += -march=native -mtune=native MK_CFLAGS += -march=native -mtune=native
MK_CXXFLAGS += -march=native -mtune=native MK_HOST_CXXFLAGS += -march=native -mtune=native
# Usage AVX-only # Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx #MK_CFLAGS += -mfma -mf16c -mavx
@ -358,6 +368,11 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif endif
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
else
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
#ifdef LLAMA_CUDA_CUBLAS #ifdef LLAMA_CUDA_CUBLAS
# NVCCFLAGS += -DGGML_CUDA_CUBLAS # NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS #endif # LLAMA_CUDA_CUBLAS
@ -365,7 +380,7 @@ ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@ $(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST ifdef LLAMA_CLBLAST
@ -435,8 +450,14 @@ endif # LLAMA_NO_K_QUANTS
# combine build flags with cmdline overrides # combine build flags with cmdline overrides
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
# save CXXFLAGS before we add host-only options
NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
override CXXFLAGS += $(HOST_CXXFLAGS)
# #
# Print build information # Print build information
# #
@ -447,6 +468,7 @@ $(info I UNAME_P: $(UNAME_P))
$(info I UNAME_M: $(UNAME_M)) $(info I UNAME_M: $(UNAME_M))
$(info I CFLAGS: $(CFLAGS)) $(info I CFLAGS: $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS)) $(info I CXXFLAGS: $(CXXFLAGS))
$(info I NVCCFLAGS: $(NVCCFLAGS))
$(info I LDFLAGS: $(LDFLAGS)) $(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(CCV)) $(info I CC: $(CCV))
$(info I CXX: $(CXXV)) $(info I CXX: $(CXXV))
@ -492,22 +514,22 @@ main: examples/main/main.cpp build-info.h ggml.
@echo '==== Run ./main -h for help. ====' @echo '==== Run ./main -h for help. ===='
@echo @echo
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) simple: examples/simple/simple.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
@ -524,7 +546,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@ -547,7 +569,7 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
endif endif
build-info.h: $(wildcard .git/index) scripts/build-info.sh build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp @sh scripts/build-info.sh $(CC) > $@.tmp
@if ! cmp -s $@.tmp $@; then \ @if ! cmp -s $@.tmp $@; then \
mv $@.tmp $@; \ mv $@.tmp $@; \
else \ else \
@ -560,7 +582,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
tests: $(TEST_TARGETS) tests: $(TEST_TARGETS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
./$@ ./$@

View File

@ -392,12 +392,13 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). | | LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
---> --->
| Option | Legal values | Default | Description | | Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------| |--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
- #### hipBLAS - #### hipBLAS

View File

@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
} }
void process_escapes(std::string& input) { static void process_escapes(std::string& input) {
std::size_t input_len = input.length(); std::size_t input_len = input.length();
std::size_t output_idx = 0; std::size_t output_idx = 0;
@ -798,10 +798,10 @@ std::vector<llama_token> llama_tokenize(
// upper limit for the number of tokens // upper limit for the number of tokens
int n_tokens = text.length() + add_bos; int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens); std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
if (n_tokens < 0) { if (n_tokens < 0) {
result.resize(-n_tokens); result.resize(-n_tokens);
int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
GGML_ASSERT(check == -n_tokens); GGML_ASSERT(check == -n_tokens);
} else { } else {
result.resize(n_tokens); result.resize(n_tokens);

View File

@ -3,6 +3,7 @@
#pragma once #pragma once
#include "llama.h" #include "llama.h"
#include "build-info.h"
#define LOG_NO_FILE_LINE_FUNCTION #define LOG_NO_FILE_LINE_FUNCTION
#include "log.h" #include "log.h"
@ -23,6 +24,11 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
} while(0)
// //
// CLI argument parsing // CLI argument parsing
// //

View File

@ -158,7 +158,7 @@ namespace console {
} }
} }
char32_t getchar32() { static char32_t getchar32() {
#if defined(_WIN32) #if defined(_WIN32)
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
wchar_t high_surrogate = 0; wchar_t high_surrogate = 0;
@ -212,7 +212,7 @@ namespace console {
#endif #endif
} }
void pop_cursor() { static void pop_cursor() {
#if defined(_WIN32) #if defined(_WIN32)
if (hConsole != NULL) { if (hConsole != NULL) {
CONSOLE_SCREEN_BUFFER_INFO bufferInfo; CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
@ -233,7 +233,7 @@ namespace console {
putc('\b', out); putc('\b', out);
} }
int estimateWidth(char32_t codepoint) { static int estimateWidth(char32_t codepoint) {
#if defined(_WIN32) #if defined(_WIN32)
(void)codepoint; (void)codepoint;
return 1; return 1;
@ -242,7 +242,7 @@ namespace console {
#endif #endif
} }
int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) { static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
#if defined(_WIN32) #if defined(_WIN32)
CONSOLE_SCREEN_BUFFER_INFO bufferInfo; CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) { if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
@ -303,7 +303,7 @@ namespace console {
#endif #endif
} }
void replace_last(char ch) { static void replace_last(char ch) {
#if defined(_WIN32) #if defined(_WIN32)
pop_cursor(); pop_cursor();
put_codepoint(&ch, 1, 1); put_codepoint(&ch, 1, 1);
@ -312,7 +312,7 @@ namespace console {
#endif #endif
} }
void append_utf8(char32_t ch, std::string & out) { static void append_utf8(char32_t ch, std::string & out) {
if (ch <= 0x7F) { if (ch <= 0x7F) {
out.push_back(static_cast<unsigned char>(ch)); out.push_back(static_cast<unsigned char>(ch));
} else if (ch <= 0x7FF) { } else if (ch <= 0x7FF) {
@ -333,7 +333,7 @@ namespace console {
} }
// Helper function to remove the last UTF-8 character from a string // Helper function to remove the last UTF-8 character from a string
void pop_back_utf8_char(std::string & line) { static void pop_back_utf8_char(std::string & line) {
if (line.empty()) { if (line.empty()) {
return; return;
} }
@ -349,7 +349,7 @@ namespace console {
line.erase(pos); line.erase(pos);
} }
bool readline_advanced(std::string & line, bool multiline_input) { static bool readline_advanced(std::string & line, bool multiline_input) {
if (out != stdout) { if (out != stdout) {
fflush(stdout); fflush(stdout);
} }
@ -452,7 +452,7 @@ namespace console {
return has_more; return has_more;
} }
bool readline_simple(std::string & line, bool multiline_input) { static bool readline_simple(std::string & line, bool multiline_input) {
#if defined(_WIN32) #if defined(_WIN32)
std::wstring wline; std::wstring wline;
if (!std::getline(std::wcin, wline)) { if (!std::getline(std::wcin, wline)) {

View File

@ -9,7 +9,7 @@
namespace grammar_parser { namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun) // NOTE: assumes valid utf8 (but checks for overrun)
// copied from llama.cpp // copied from llama.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) { static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src); uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4; uint8_t highbits = first_byte >> 4;
@ -24,19 +24,19 @@ namespace grammar_parser {
return std::make_pair(value, pos); return std::make_pair(value, pos);
} }
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second; return result.first->second;
} }
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id; return next_id;
} }
void add_rule( static void add_rule(
parse_state & state, parse_state & state,
uint32_t rule_id, uint32_t rule_id,
const std::vector<llama_grammar_element> & rule) { const std::vector<llama_grammar_element> & rule) {
@ -46,11 +46,11 @@ namespace grammar_parser {
state.rules[rule_id] = rule; state.rules[rule_id] = rule;
} }
bool is_word_char(char c) { static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9'); return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
} }
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) { static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src; const char * pos = src;
const char * end = src + size; const char * end = src + size;
uint32_t value = 0; uint32_t value = 0;
@ -73,7 +73,7 @@ namespace grammar_parser {
return std::make_pair(value, pos); return std::make_pair(value, pos);
} }
const char * parse_space(const char * src, bool newline_ok) { static const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src; const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' || while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) { (newline_ok && (*pos == '\r' || *pos == '\n'))) {
@ -88,7 +88,7 @@ namespace grammar_parser {
return pos; return pos;
} }
const char * parse_name(const char * src) { static const char * parse_name(const char * src) {
const char * pos = src; const char * pos = src;
while (is_word_char(*pos)) { while (is_word_char(*pos)) {
pos++; pos++;
@ -99,7 +99,7 @@ namespace grammar_parser {
return pos; return pos;
} }
std::pair<uint32_t, const char *> parse_char(const char * src) { static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') { if (*src == '\\') {
switch (src[1]) { switch (src[1]) {
case 'x': return parse_hex(src + 2, 2); case 'x': return parse_hex(src + 2, 2);
@ -129,7 +129,7 @@ namespace grammar_parser {
uint32_t rule_id, uint32_t rule_id,
bool is_nested); bool is_nested);
const char * parse_sequence( static const char * parse_sequence(
parse_state & state, parse_state & state,
const char * src, const char * src,
const std::string & rule_name, const std::string & rule_name,
@ -247,7 +247,7 @@ namespace grammar_parser {
return pos; return pos;
} }
const char * parse_rule(parse_state & state, const char * src) { static const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src); const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false); const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src; size_t name_len = name_end - src;
@ -285,7 +285,7 @@ namespace grammar_parser {
} }
} }
void print_grammar_char(FILE * file, uint32_t c) { static void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) { if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c)); fprintf(file, "%c", static_cast<char>(c));
} else { } else {
@ -294,7 +294,7 @@ namespace grammar_parser {
} }
} }
bool is_char_element(llama_grammar_element elem) { static bool is_char_element(llama_grammar_element elem) {
switch (elem.type) { switch (elem.type) {
case LLAMA_GRETYPE_CHAR: return true; case LLAMA_GRETYPE_CHAR: return true;
case LLAMA_GRETYPE_CHAR_NOT: return true; case LLAMA_GRETYPE_CHAR_NOT: return true;
@ -304,7 +304,7 @@ namespace grammar_parser {
} }
} }
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) { static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
for (auto elem : rule) { for (auto elem : rule) {
switch (elem.type) { switch (elem.type) {
case LLAMA_GRETYPE_END: fprintf(file, "END"); break; case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
@ -334,7 +334,7 @@ namespace grammar_parser {
fprintf(file, "\n"); fprintf(file, "\n");
} }
void print_rule( static void print_rule(
FILE * file, FILE * file,
uint32_t rule_id, uint32_t rule_id,
const std::vector<llama_grammar_element> & rule, const std::vector<llama_grammar_element> & rule,

248
convert-starcoder-hf-to-gguf.py Executable file
View File

@ -0,0 +1,248 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
scores.append(0.0) # dymmy
toktypes.append(gguf.TokenType.NORMAL) # dummy
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["n_embd"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View File

@ -9,12 +9,12 @@
#endif #endif
#ifdef LLAMA_DEFAULT_RMS_EPS #ifdef LLAMA_DEFAULT_RMS_EPS
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
#else #else
static const float rms_norm_eps = 5e-6f; constexpr float rms_norm_eps = 5e-6f;
#endif #endif
float frand() { static float frand() {
return (float)rand()/(float)RAND_MAX; return (float)rand()/(float)RAND_MAX;
} }
@ -25,19 +25,21 @@ struct random_normal_distribution {
float max; float max;
}; };
void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { static void init_random_normal_distribution(
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
) {
rnd->gen = std::mt19937(seed); rnd->gen = std::mt19937(seed);
rnd->nd = std::normal_distribution<float>{mean, std}; rnd->nd = std::normal_distribution<float>{mean, std};
rnd->min = min; rnd->min = min;
rnd->max = max; rnd->max = max;
} }
float frand_normal(struct random_normal_distribution * rnd) { static float frand_normal(struct random_normal_distribution * rnd) {
const float r = rnd->nd(rnd->gen); const float r = rnd->nd(rnd->gen);
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r); return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
} }
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) { if (plan.work_size > 0) {
@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
ggml_graph_compute(graph, &plan); ggml_graph_compute(graph, &plan);
} }
struct ggml_tensor * randomize_tensor( static struct ggml_tensor * randomize_tensor(
struct ggml_tensor * tensor, struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
int ndims, ) {
const int64_t ne[],
float fmin,
float fmax) {
switch (ndims) { switch (ndims) {
case 1: case 1:
for (int i0 = 0; i0 < ne[0]; i0++) { for (int i0 = 0; i0 < ne[0]; i0++) {
@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
return tensor; return tensor;
} }
struct ggml_tensor * randomize_tensor_normal( static struct ggml_tensor * randomize_tensor_normal(
struct ggml_tensor * tensor, struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
int ndims, ) {
const int64_t ne[],
struct random_normal_distribution * rnd) {
float scale = 1.0; // xavier float scale = 1.0; // xavier
switch (ndims) { switch (ndims) {
case 1: case 1:
@ -159,7 +155,7 @@ struct llama_hparams {
} }
}; };
uint32_t get_n_ff(const struct llama_hparams* hparams) { static uint32_t get_n_ff(const struct llama_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff; return n_ff;
} }
@ -260,7 +256,7 @@ struct llama_model_lora {
std::vector<llama_layer_lora> layers; std::vector<llama_layer_lora> layers;
}; };
void init_model(struct llama_model * model) { static void init_model(struct llama_model * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
} }
void init_model_lora(struct llama_model_lora * model) { static void init_model_lora(struct llama_model_lora * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
} }
} }
void set_param_model(struct llama_model * model) { static void set_param_model(struct llama_model * model) {
const auto& hparams = model->hparams; const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
} }
} }
void set_param_model_lora(struct llama_model_lora * model) { static void set_param_model_lora(struct llama_model_lora * model) {
const auto& hparams = model->hparams; const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
} }
} }
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
} }
void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) { static void randomize_model_lora(
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
} }
} }
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx; const uint32_t n_ctx = hparams.n_ctx;
@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
return true; return true;
} }
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx; const uint32_t n_ctx = hparams.n_ctx;
@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
return true; return true;
} }
struct ggml_tensor * forward( static struct ggml_tensor * forward(
struct llama_model * model, struct llama_model * model,
struct llama_kv_cache * cache, struct llama_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
const int n_tokens, const int n_tokens,
const int n_past) { const int n_past
) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@ -764,25 +762,25 @@ struct ggml_tensor * forward(
return inpL; return inpL;
} }
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->n_dims == 1);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
} }
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
GGML_ASSERT(tensor->n_dims == 2); GGML_ASSERT(tensor->n_dims == 2);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
} }
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
GGML_ASSERT(tensor->n_dims == 3); GGML_ASSERT(tensor->n_dims == 3);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == ne2); GGML_ASSERT(tensor->ne[2] == ne2);
} }
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
GGML_ASSERT(tensor->n_dims == 4); GGML_ASSERT(tensor->n_dims == 4);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
@ -790,7 +788,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
GGML_ASSERT(tensor->ne[3] == ne3); GGML_ASSERT(tensor->ne[3] == ne3);
} }
struct ggml_tensor * forward_batch( static struct ggml_tensor * forward_batch(
struct llama_model * model, struct llama_model * model,
struct llama_kv_cache * cache, struct llama_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
@ -798,8 +796,8 @@ struct ggml_tensor * forward_batch(
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
const int n_tokens, const int n_tokens,
const int n_past, const int n_past,
const int n_batch) { const int n_batch
) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@ -1090,16 +1088,15 @@ struct ggml_tensor * forward_batch(
return inpL; return inpL;
} }
static struct ggml_tensor * forward_lora(
struct ggml_tensor * forward_lora(
struct llama_model_lora * model, struct llama_model_lora * model,
struct llama_kv_cache * cache, struct llama_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
const int n_tokens, const int n_tokens,
const int n_past) { const int n_past
) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@ -1353,7 +1350,7 @@ struct ggml_tensor * forward_lora(
return inpL; return inpL;
} }
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
assert(logits->n_dims == 2); assert(logits->n_dims == 2);
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
assert(best_samples->n_dims == 1); assert(best_samples->n_dims == 1);
@ -1384,7 +1381,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
} }
} }
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { static void sample_softmax_batch(
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
struct ggml_tensor * best_samples
) {
GGML_ASSERT(best_samples->n_dims == 2); GGML_ASSERT(best_samples->n_dims == 2);
GGML_ASSERT(logits->n_dims == 3); GGML_ASSERT(logits->n_dims == 3);
GGML_ASSERT(probs->n_dims == 3); GGML_ASSERT(probs->n_dims == 3);
@ -1418,7 +1418,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
} }
} }
void print_row(struct ggml_tensor * probs, int i) { static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
printf(" %.2f", p); printf(" %.2f", p);
@ -1426,7 +1426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n"); printf("\n");
} }
void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
@ -1437,7 +1437,7 @@ void print_matrix(struct ggml_tensor * probs) {
} }
} }
void print_token(int token, int n_vocab) { static void print_token(int token, int n_vocab) {
for (int k = 0; k < token; ++k) { for (int k = 0; k < token; ++k) {
printf(" "); printf(" ");
} }
@ -1448,14 +1448,14 @@ void print_token(int token, int n_vocab) {
printf("\n"); printf("\n");
} }
void print_tokens(struct ggml_tensor * tokens, int n_vocab) { static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
for (int i=0; i<tokens->ne[0]; ++i) { for (int i=0; i<tokens->ne[0]; ++i) {
int token = ggml_get_i32_1d(tokens, i); int token = ggml_get_i32_1d(tokens, i);
print_token(token, n_vocab); print_token(token, n_vocab);
} }
} }
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0]; int n_vocab = targets->ne[0];
float randomness = 0.0f; float randomness = 0.0f;
@ -1476,7 +1476,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
} }
} }
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { static void get_example_targets_batch(
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
) {
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(tokens_input->n_dims == 2);
GGML_ASSERT( targets->n_dims == 3); GGML_ASSERT( targets->n_dims == 3);
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
@ -1499,7 +1501,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
} }
} }
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0]; int n_vocab = targets->ne[0];
for (int i=0; i<n_tokens-n_shift; ++i) { for (int i=0; i<n_tokens-n_shift; ++i) {
@ -1510,12 +1512,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
} }
} }
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { static struct ggml_tensor * square_error_loss(
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
) {
// todo: instead of a-b: a[1:]-b[:-1] // todo: instead of a-b: a[1:]-b[:-1]
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b))); return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
} }
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { static struct ggml_tensor * cross_entropy_loss(
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
) {
const float eps = 1e-3f; const float eps = 1e-3f;
return return
ggml_sum(ctx, ggml_sum(ctx,

View File

@ -3,6 +3,3 @@ add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <cassert> #include <cassert>
#include <cinttypes> #include <cinttypes>
@ -30,7 +29,8 @@ struct ostream_beam_view {
llama_context * ctx; llama_context * ctx;
llama_beam_view beam_view; llama_beam_view beam_view;
}; };
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
@ -46,7 +46,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc. // For example, eob can be flagged due to maximum token length, stop words, etc.
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) { static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
} }
@ -56,7 +56,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met. // This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data. // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr); auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed. // Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {

View File

@ -1,7 +1,8 @@
set(TARGET benchmark) set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp) add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO)

View File

@ -1,5 +1,5 @@
#include "common.h"
#include "ggml.h" #include "ggml.h"
#include "build-info.h"
#include <locale.h> #include <locale.h>
#include <assert.h> #include <assert.h>
@ -99,7 +99,7 @@ int main(int argc, char ** argv) {
exit(1); exit(1);
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
printf("Starting Test\n"); printf("Starting Test\n");
// create the ggml context // create the ggml context

View File

@ -115,7 +115,7 @@ struct TransformerWeights {
} }
}; };
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy // we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim](); w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
} }
} }
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) { static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1; if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1; if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
return 0; return 0;
} }
void print_sample_weights(TransformerWeights *w){ static void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n"); printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]); printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]); printf("%f\n", w->rms_att_weight[0]);
@ -324,7 +324,7 @@ struct train_params {
int mem_compute1_gb; int mem_compute1_gb;
}; };
void print_params(struct my_llama_hparams * params) { static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_embd: %d\n", __func__, params->n_embd);
@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
printf("%s: n_rot: %d\n", __func__, params->n_rot); printf("%s: n_rot: %d\n", __func__, params->n_rot);
} }
void init_model(struct my_llama_model * model) { static void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
} }
} }
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr; return *ptr;
} }
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr; return *ptr;
} }
void print_row(struct ggml_tensor * probs, int i) { static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i); float p = get_f32_2d(probs, k, i);
printf(" %f", p); printf(" %f", p);
@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n"); printf("\n");
} }
void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
@ -531,7 +531,7 @@ struct llama_file {
} }
}; };
bool is_ggml_file(const char *filename) { static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (file.size < 4) { if (file.size < 4) {
return false; return false;
@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
return out.str(); return out.str();
} }
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) { if (is_ggml_file(filename)) {
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
} }
} }
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct; int ct;
switch (gg_weights->n_dims){ switch (gg_weights->n_dims){
case 1: case 1:
@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
} }
} }
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { static void save_as_llama_model(
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
// convert AK weights into GG weights one by one. // convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings // w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor // float* -> struct ggml_tensor
@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
gguf_free(ctx); gguf_free(ctx);
} }
struct train_params get_default_train_params() { static struct train_params get_default_train_params() {
struct train_params params; struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_llama2c_output_model = "ak_llama_model.bin";
@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
return params; return params;
} }
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) { static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "options:\n"); fprintf(stderr, "options:\n");
@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
bool params_parse(int argc, char ** argv, struct train_params * params) { static bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false; bool invalid_param = false;
bool reqd_param_found = false; bool reqd_param_found = false;
std::string arg; std::string arg;
@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
return true; return true;
} }
std::string basename(const std::string &path) { static std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\"); size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) { if (pos == std::string::npos) {
return path; return path;

View File

@ -1,3 +1,4 @@
#include "common.h"
#include "embd-input.h" #include "embd-input.h"
#include <cassert> #include <cassert>
@ -22,7 +23,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
return nullptr; return nullptr;
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = uint32_t(time(NULL)); params.seed = uint32_t(time(NULL));

View File

@ -3,7 +3,6 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
extern "C" { extern "C" {

View File

@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <ctime> #include <ctime>
@ -17,7 +16,7 @@ int main(int argc, char ** argv) {
params.embedding = true; params.embedding = true;
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);

View File

@ -20,7 +20,7 @@ static std::string to_string(const T & val) {
return ss.str(); return ss.str();
} }
bool gguf_ex_write(const std::string & fname) { static bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty(); struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12); gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
} }
// just read tensor info // just read tensor info
bool gguf_ex_read_0(const std::string & fname) { static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = { struct gguf_init_params params = {
/*.no_alloc = */ false, /*.no_alloc = */ false,
/*.ctx = */ NULL, /*.ctx = */ NULL,
@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
} }
// read and create ggml_context containing the tensors and their data // read and create ggml_context containing the tensors and their data
bool gguf_ex_read_1(const std::string & fname) { static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = { struct gguf_init_params params = {

View File

@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens; static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false; static bool is_interacting = false;
void write_logfile(
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model, const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output, const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens const std::vector<llama_token> & output_tokens
@ -86,7 +87,7 @@ void write_logfile(
} }
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) { static void sigint_handler(int signo) {
if (signo == SIGINT) { if (signo == SIGINT) {
if (!is_interacting) { if (!is_interacting) {
is_interacting = true; is_interacting = true;
@ -148,6 +149,7 @@ int main(int argc, char ** argv) {
} }
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);

View File

@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
@ -28,9 +27,10 @@ struct results_log_softmax {
float prob; float prob;
}; };
void write_logfile(const llama_context * ctx, const gpt_params & params, static void write_logfile(
const llama_model * model, const struct results_perplexity & results) { const llama_context * ctx, const gpt_params & params, const llama_model * model,
const struct results_perplexity & results
) {
if (params.logdir.empty()) { if (params.logdir.empty()) {
return; return;
} }
@ -76,7 +76,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
fclose(logfile); fclose(logfile);
} }
std::vector<float> softmax(const std::vector<float>& logits) { static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size()); std::vector<float> probs(logits.size());
float max_logit = logits[0]; float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v); for (float v : logits) max_logit = std::max(max_logit, v);
@ -92,7 +92,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs; return probs;
} }
results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0]; float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0; double sum_exp = 0.0;
@ -100,9 +100,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
} }
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, static void process_logits(
double & nll, double & nll2, float * logit_history, float * prob_history) { int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history
) {
std::mutex mutex; std::mutex mutex;
int counter = 0; int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@ -130,7 +131,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
} }
results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
@ -260,8 +261,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
return {tokens, std::exp(nll / count), logit_history, prob_history}; return {tokens, std::exp(nll / count), logit_history, prob_history};
} }
results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
if (params.ppl_stride > 0) { if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params); return perplexity_v2(ctx, params);
} }
@ -400,8 +400,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
return {tokens, ppl, logit_history, prob_history}; return {tokens, ppl, logit_history, prob_history};
} }
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int> & tokens, int n_past, int n_batch, static std::vector<float> hellaswag_evaluate_tokens(
int n_vocab, int n_thread) { llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
) {
std::vector<float> result; std::vector<float> result;
result.reserve(tokens.size() * n_vocab); result.reserve(tokens.size() * n_vocab);
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch; size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@ -421,7 +422,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
return result; return result;
} }
void hellaswag_score(llama_context * ctx, const gpt_params & params) { static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// Calculates hellaswag score (acc_norm) from prompt // Calculates hellaswag score (acc_norm) from prompt
// //
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@ -668,7 +669,7 @@ int main(int argc, char ** argv) {
params.n_ctx += params.ppl_stride/2; params.n_ctx += params.ppl_stride/2;
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);

View File

@ -2,4 +2,5 @@ set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp) add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -1,7 +1,6 @@
#include "ggml.h"
#include "build-info.h"
#define LLAMA_API_INTERNAL #define LLAMA_API_INTERNAL
#include "common.h"
#include "ggml.h"
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
@ -34,8 +33,8 @@ struct quantize_stats_params {
std::vector<enum ggml_type> include_types; std::vector<enum ggml_type> include_types;
}; };
const size_t HISTOGRAM_BUCKETS = 150; constexpr size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03; constexpr double HISTOGRAM_RANGE = 0.03;
struct error_stats { struct error_stats {
size_t num_samples; size_t num_samples;
@ -44,8 +43,7 @@ struct error_stats {
uint64_t error_histogram[HISTOGRAM_BUCKETS]; uint64_t error_histogram[HISTOGRAM_BUCKETS];
}; };
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
quantize_stats_params params; quantize_stats_params params;
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
@ -71,7 +69,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
} }
// Check if a layer is included/excluded by command line // Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params & params, const std::string & layer) { static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) { for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) { if (std::regex_search(layer, std::regex(excluded))) {
return false; return false;
@ -86,7 +84,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
} }
// Update error statistics given vectors with the before/after result of quantization // Update error statistics given vectors with the before/after result of quantization
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
for (int64_t i = 0; i < nelements; i++) { for (int64_t i = 0; i < nelements; i++) {
double diff = input[i] - output[i]; double diff = input[i] - output[i];
stats.total_error += diff * diff; stats.total_error += diff * diff;
@ -96,14 +94,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
stats.num_samples += nelements; stats.num_samples += nelements;
} }
void combine_error_stats(error_stats & into, const error_stats & from) { static void combine_error_stats(error_stats & into, const error_stats & from) {
into.num_samples += from.num_samples; into.num_samples += from.num_samples;
into.total_error += from.total_error; into.total_error += from.total_error;
if (from.max_error > into.max_error) into.max_error = from.max_error; if (from.max_error > into.max_error) into.max_error = from.max_error;
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i]; for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
} }
double find_quantile(const error_stats & stats, double quantile) { static double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0); double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
double accum = 0; double accum = 0;
@ -116,7 +114,7 @@ double find_quantile(const error_stats & stats, double quantile) {
return INFINITY; return INFINITY;
} }
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples); double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5); double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95); double pct95 = find_quantile(stats, .95);
@ -143,17 +141,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
} }
void test_roundtrip_on_chunk( static void test_roundtrip_on_chunk(
const ggml_tensor * layer, const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
int64_t offset, float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
int64_t chunk_size, ) {
const ggml_type_traits_t & qfns,
bool use_reference,
float * input_scratch,
char * quantized_scratch,
float * output_scratch,
error_stats & stats) {
if (layer->type == GGML_TYPE_F16) { if (layer->type == GGML_TYPE_F16) {
for (int i = 0; i < chunk_size; i++) { for (int i = 0; i < chunk_size; i++) {
input_scratch[i] = ggml_get_f32_1d(layer, i + offset); input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
@ -174,18 +165,11 @@ void test_roundtrip_on_chunk(
// Run quantization function for a single layer and update error stats // Run quantization function for a single layer and update error stats
void test_roundtrip_on_layer( static void test_roundtrip_on_layer(
std::string & name, std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
bool print_layer_stats, const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
const ggml_type_traits_t & qfns, std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
bool use_reference, ) {
const ggml_tensor * layer,
std::vector<float> & input_scratch,
std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch,
error_stats & total_error,
int max_thread = 0) {
assert(tensor_is_contiguous(layer)); assert(tensor_is_contiguous(layer));
error_stats layer_error {}; error_stats layer_error {};
uint64_t nelements = ggml_nelements(layer); uint64_t nelements = ggml_nelements(layer);
@ -314,7 +298,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
// load the model // load the model
fprintf(stderr, "Loading model\n"); fprintf(stderr, "Loading model\n");

View File

@ -2,6 +2,7 @@ set(TARGET quantize)
add_executable(${TARGET} quantize.cpp) add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO)

View File

@ -1,5 +1,4 @@
#include "build-info.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include <cstdio> #include <cstdio>
@ -40,7 +39,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
}; };
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str; std::string ftype_str;
for (auto ch : ftype_str_in) { for (auto ch : ftype_str_in) {
@ -72,7 +71,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// usage: // usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// //
void usage(const char * executable) { static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@ -161,7 +160,7 @@ int main(int argc, char ** argv) {
} }
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
if (params.nthread > 0) { if (params.nthread > 0) {

View File

@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <vector> #include <vector>
#include <cstdio> #include <cstdio>
@ -17,7 +16,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
if (params.n_predict < 0) { if (params.n_predict < 0) {
params.n_predict = 16; params.n_predict = 16;

View File

@ -1083,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string
return res; return res;
} }
static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs) static json format_partial_response(
{ llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
) {
json res = json{ json res = json{
{"content", content}, {"content", content},
{"stop", false}, {"stop", false},
@ -1215,7 +1216,7 @@ static void log_server_request(const Request &req, const Response &res)
}); });
} }
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) { static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx); return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
} }
@ -1225,7 +1226,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met. // This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data. // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data, llama_beams_state beams_state) { static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
auto & llama = *static_cast<llama_server_context*>(callback_data); auto & llama = *static_cast<llama_server_context*>(callback_data);
// Mark beams as EOS as needed. // Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
@ -1258,7 +1259,8 @@ struct token_translator {
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); } std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
}; };
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) { static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
{
auto & gtps = llama.generated_token_probs; auto & gtps = llama.generated_token_probs;
auto translator = token_translator{llama.ctx}; auto translator = token_translator{llama.ctx};
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); }; auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };

View File

@ -3,6 +3,3 @@ add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@ -1,5 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View File

@ -973,10 +973,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
buf[size] = '\0'; buf[size] = '\0';
int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); int n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false);
if (n_tokens < 0) { if (n_tokens < 0) {
out.resize(-n_tokens); out.resize(-n_tokens);
n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false);
} }
GGML_ASSERT(n_tokens >= 0); GGML_ASSERT(n_tokens >= 0);
out.resize(n_tokens); out.resize(n_tokens);

View File

@ -31,6 +31,9 @@
#define cublasSetStream hipblasSetStream #define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm #define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t #define cublasStatus_t hipblasStatus_t
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t #define cudaError_t hipError_t
@ -61,7 +64,7 @@
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0) #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t #define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess #define cudaSuccess hipSuccess
#else #else
@ -190,6 +193,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
} while (0) } while (0)
#endif // CUDART_VERSION >= 11 #endif // CUDART_VERSION >= 11
#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100
#ifdef GGML_CUDA_F16 #ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float typedef half dfloat; // dequantize float
typedef half2 dfloat2; typedef half2 dfloat2;
@ -418,6 +427,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif #endif
#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
#define MUL_MAT_SRC1_COL_STRIDE 128 #define MUL_MAT_SRC1_COL_STRIDE 128
#define MAX_STREAMS 8 #define MAX_STREAMS 8
@ -2145,10 +2158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI4_0; const int kbx = k / QI4_0;
const int kqsx = k % QI4_0; const int kqsx = k % QI4_0;
@ -2239,10 +2252,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI4_1; const int kbx = k / QI4_1;
const int kqsx = k % QI4_1; const int kqsx = k % QI4_1;
@ -2331,10 +2344,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI5_0; const int kbx = k / QI5_0;
const int kqsx = k % QI5_0; const int kqsx = k % QI5_0;
@ -2445,10 +2458,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI5_1; const int kbx = k / QI5_1;
const int kqsx = k % QI5_1; const int kqsx = k % QI5_1;
@ -2551,10 +2564,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI8_0; const int kbx = k / QI8_0;
const int kqsx = k % QI8_0; const int kqsx = k % QI8_0;
@ -2642,10 +2655,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI2_K; const int kbx = k / QI2_K;
const int kqsx = k % QI2_K; const int kqsx = k % QI2_K;
@ -2763,10 +2776,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI3_K; const int kbx = k / QI3_K;
const int kqsx = k % QI3_K; const int kqsx = k % QI3_K;
@ -2981,10 +2994,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI4_K; // == 0 if QK_K == 256 const int kbx = k / QI4_K; // == 0 if QK_K == 256
const int kqsx = k % QI4_K; // == k if QK_K == 256 const int kqsx = k % QI4_K; // == k if QK_K == 256
@ -3162,10 +3175,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI5_K; // == 0 if QK_K == 256 const int kbx = k / QI5_K; // == 0 if QK_K == 256
const int kqsx = k % QI5_K; // == k if QK_K == 256 const int kqsx = k % QI5_K; // == k if QK_K == 256
@ -3291,10 +3304,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
__builtin_assume(i_offset >= 0); GGML_CUDA_ASSUME(i_offset >= 0);
__builtin_assume(i_offset < nwarps); GGML_CUDA_ASSUME(i_offset < nwarps);
__builtin_assume(k >= 0); GGML_CUDA_ASSUME(k >= 0);
__builtin_assume(k < WARP_SIZE); GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI6_K; // == 0 if QK_K == 256 const int kbx = k / QI6_K; // == 0 if QK_K == 256
const int kqsx = k % QI6_K; // == k if QK_K == 256 const int kqsx = k % QI6_K; // == k if QK_K == 256
@ -6252,6 +6265,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
} }
} }
void ggml_cuda_set_peer_access(const int n_tokens) {
static bool peer_access_enabled = false;
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
if (peer_access_enabled == enable_peer_access) {
return;
}
#ifdef NDEBUG
for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(ggml_cuda_set_device(id));
for (int id_other = 0; id_other < g_device_count; ++id_other) {
if (id == id_other) {
continue;
}
if (id != g_main_device && id_other != g_main_device) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
if (enable_peer_access) {
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
} else {
CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
}
}
}
}
#endif // NDEBUG
peer_access_enabled = enable_peer_access;
}
static void ggml_cuda_op_mul_mat( static void ggml_cuda_op_mul_mat(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
const bool convert_src1_to_q8_1) { const bool convert_src1_to_q8_1) {
@ -6276,6 +6326,8 @@ static void ggml_cuda_op_mul_mat(
const int nb2 = dst->nb[2]; const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3]; const int nb3 = dst->nb[3];
ggml_cuda_set_peer_access(ne11);
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
@ -6408,7 +6460,7 @@ static void ggml_cuda_op_mul_mat(
// wait for main GPU data if necessary // wait for main GPU data if necessary
if (split && (id != g_main_device || is != 0)) { if (split && (id != g_main_device || is != 0)) {
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0])); CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
} }
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@ -6530,7 +6582,7 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(ggml_cuda_set_device(g_main_device));
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
for (int64_t is = 0; is < is_max; ++is) { for (int64_t is = 0; is < is_max; ++is) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is])); CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
} }
} }
} }
@ -6964,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
return; return;
} }
if (g_scratch_buffer == nullptr) { if (g_scratch_buffer == nullptr) {
ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
} }
@ -7003,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, false, true, false); ggml_cuda_assign_buffers_impl(tensor, false, true, false);
} }
void ggml_cuda_set_main_device(int main_device) { void ggml_cuda_set_main_device(const int main_device) {
if (main_device >= g_device_count) { if (main_device >= g_device_count) {
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
main_device, g_device_count, g_main_device); main_device, g_device_count, g_main_device);
@ -7017,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
} }
} }
void ggml_cuda_set_mul_mat_q(bool mul_mat_q) { void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
g_mul_mat_q = mul_mat_q; g_mul_mat_q = mul_mat_q;
} }
void ggml_cuda_set_scratch_size(size_t scratch_size) { void ggml_cuda_set_scratch_size(const size_t scratch_size) {
g_scratch_size = scratch_size; g_scratch_size = scratch_size;
} }

View File

@ -84,6 +84,7 @@ class MODEL_ARCH(IntEnum):
GPTJ : int = auto() GPTJ : int = auto()
GPTNEOX : int = auto() GPTNEOX : int = auto()
MPT : int = auto() MPT : int = auto()
STARCODER : int = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -114,6 +115,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox", MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt", MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder",
} }
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
@ -171,6 +173,18 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
}, },
MODEL_ARCH.STARCODER: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPT2: { MODEL_ARCH.GPT2: {
# TODO # TODO
}, },

402
llama.cpp
View File

@ -1,3 +1,4 @@
#define LLAMA_API_INTERNAL
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"
@ -109,7 +110,7 @@ static size_t utf8_len(char src) {
return lookup[highbits]; return lookup[highbits];
} }
void replace_all(std::string & s, const std::string & search, const std::string & replace) { static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result; std::string result;
for (size_t pos = 0; ; pos += search.length()) { for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos); auto new_pos = s.find(search, pos);
@ -161,6 +162,7 @@ enum llm_arch {
LLM_ARCH_GPTJ, LLM_ARCH_GPTJ,
LLM_ARCH_GPTNEOX, LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT, LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@ -172,6 +174,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
{ LLM_ARCH_GPTNEOX, "gptneox" }, { LLM_ARCH_GPTNEOX, "gptneox" },
{ LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" },
}; };
enum llm_kv { enum llm_kv {
@ -377,6 +380,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
}, },
}, },
{
LLM_ARCH_STARCODER,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {
@ -896,9 +914,11 @@ static llama_state g_state;
// available llama models // available llama models
enum e_model { enum e_model {
MODEL_UNKNOWN, MODEL_UNKNOWN,
MODEL_1B,
MODEL_3B, MODEL_3B,
MODEL_7B, MODEL_7B,
MODEL_13B, MODEL_13B,
MODEL_15B,
MODEL_30B, MODEL_30B,
MODEL_34B, MODEL_34B,
MODEL_40B, MODEL_40B,
@ -908,6 +928,7 @@ enum e_model {
static const size_t kB = 1024; static const size_t kB = 1024;
static const size_t MB = kB*kB; static const size_t MB = kB*kB;
static const size_t GB = kB*kB*kB;
// default hparams (LLaMA 7B) // default hparams (LLaMA 7B)
struct llama_hparams { struct llama_hparams {
@ -967,13 +988,22 @@ struct llama_layer {
struct ggml_tensor * wo; struct ggml_tensor * wo;
struct ggml_tensor * wqkv; struct ggml_tensor * wqkv;
// attention bias
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
// normalization // normalization
struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
// ff // ff
struct ggml_tensor * w1; // ffn_gate struct ggml_tensor * w1; // ffn_gate
struct ggml_tensor * w2; // ffn_down struct ggml_tensor * w2; // ffn_down
struct ggml_tensor * w3; // ffn_up struct ggml_tensor * w3; // ffn_up
// ff bias
struct ggml_tensor * b2; // ffn_down
struct ggml_tensor * b3; // ffn_up
}; };
struct llama_kv_cell { struct llama_kv_cell {
@ -1067,6 +1097,7 @@ struct llama_model {
llama_vocab vocab; llama_vocab vocab;
struct ggml_tensor * tok_embeddings; struct ggml_tensor * tok_embeddings;
struct ggml_tensor * pos_embeddings;
struct ggml_tensor * output_norm; struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b; struct ggml_tensor * output_norm_b;
@ -1334,6 +1365,7 @@ struct llama_model_loader {
int n_created = 0; int n_created = 0;
int64_t n_elements = 0; int64_t n_elements = 0;
size_t n_bytes = 0;
bool use_mmap = false; bool use_mmap = false;
@ -1366,6 +1398,7 @@ struct llama_model_loader {
const char * name = gguf_get_tensor_name(ctx_gguf, i); const char * name = gguf_get_tensor_name(ctx_gguf, i);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
n_elements += ggml_nelements(t); n_elements += ggml_nelements(t);
n_bytes += ggml_nbytes(t);
} }
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@ -1644,7 +1677,7 @@ struct llama_model_loader {
// load LLaMA models // load LLaMA models
// //
std::string llama_model_ftype_name(enum llama_ftype ftype) { static std::string llama_model_ftype_name(enum llama_ftype ftype) {
if (ftype & LLAMA_FTYPE_GUESSED) { if (ftype & LLAMA_FTYPE_GUESSED) {
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
} }
@ -1677,9 +1710,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
static const char * llama_model_type_name(e_model type) { static const char * llama_model_type_name(e_model type) {
switch (type) { switch (type) {
case MODEL_1B: return "1B";
case MODEL_3B: return "3B"; case MODEL_3B: return "3B";
case MODEL_7B: return "7B"; case MODEL_7B: return "7B";
case MODEL_13B: return "13B"; case MODEL_13B: return "13B";
case MODEL_15B: return "15B";
case MODEL_30B: return "30B"; case MODEL_30B: return "30B";
case MODEL_34B: return "34B"; case MODEL_34B: return "34B";
case MODEL_40B: return "40B"; case MODEL_40B: return "40B";
@ -1797,6 +1832,17 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_STARCODER:
{
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break;
case 36: model.type = e_model::MODEL_3B; break;
case 42: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_15B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: (void)0; default: (void)0;
}; };
@ -1950,7 +1996,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
if (ml.n_bytes < GB) {
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
} else {
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
}
// general kv // general kv
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
@ -2250,6 +2301,85 @@ static void llm_load_tensors(
} }
} }
} break; } break;
case LLM_ARCH_STARCODER:
{
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
// output
{
ggml_backend backend_norm;
ggml_backend backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#else
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
}
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(model.output_norm);
vram_weights += ggml_nbytes(model.output_norm_b);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
vram_weights += ggml_nbytes(model.output);
}
}
const uint32_t n_ff = hparams.n_ff;
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
}
}
} break;
default: default:
throw std::runtime_error("unknown architecture"); throw std::runtime_error("unknown architecture");
}; };
@ -3445,6 +3575,248 @@ static struct ggml_cgraph * llm_build_falcon(
return gf; return gf;
} }
static struct ggml_cgraph * llm_build_starcoder(
llama_context & lctx,
llama_batch & batch) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & kv_self = lctx.kv_self;
GGML_ASSERT(!!kv_self.ctx);
const int64_t n_embd = hparams.n_embd;
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = hparams.n_ctx;
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
GGML_ASSERT(n_embd_head == hparams.n_rot);
const float norm_eps = hparams.f_norm_eps;
const int32_t n_tokens = batch.n_tokens;
auto & buf_compute = lctx.buf_compute;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size,
/*.mem_buffer =*/ buf_compute.data,
/*.no_alloc =*/ false,
};
params.no_alloc = true;
struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur;
struct ggml_tensor * token;
struct ggml_tensor * position;
struct ggml_tensor * inpL;
if (batch.token) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
ggml_allocr_alloc(lctx.alloc, inp_tokens);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
}
ggml_set_name(inp_tokens, "inp_tokens");
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
} else {
#ifdef GGML_USE_MPI
GGML_ASSERT(false && "not implemented");
#endif
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
ggml_allocr_alloc(lctx.alloc, token);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
}
}
{
// Compute position embeddings.
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
ggml_allocr_alloc(lctx.alloc, inp_positions);
if (!ggml_allocr_is_measure(lctx.alloc)) {
for (int i = 0; i < n_tokens; ++i) {
((int32_t *) inp_positions->data)[i] = batch.pos[i];
}
}
ggml_set_name(inp_positions, "inp_positions");
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
}
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(lctx.alloc, KQ_scale);
if (!ggml_allocr_is_measure(lctx.alloc)) {
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1);
ggml_allocr_alloc(lctx.alloc, KQ_mask);
if (!ggml_allocr_is_measure(lctx.alloc)) {
float * data = (float *) KQ_mask->data;
memset(data, 0, ggml_nbytes(KQ_mask));
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
for (int i = 0; i < n_ctx; ++i) {
if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) {
data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY;
}
}
}
}
}
inpL = ggml_add(ctx0, token, position);
ggml_set_name(inpL, "inpL");
for (int il = 0; il < n_layer; ++il) {
{
// Norm
cur = ggml_norm(ctx0, inpL, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
}
{
// Self Attention
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
struct ggml_tensor * Qcur = tmpq;
struct ggml_tensor * Kcur = tmpk;
{
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head));
ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
0, 2, 1, 3);
ggml_set_name(Q, "Q");
struct ggml_tensor * K =
ggml_view_3d(ctx0, kv_self.k,
n_embd_head, n_ctx, n_head_kv,
ggml_element_size(kv_self.k)*n_embd_gqa,
ggml_element_size(kv_self.k)*n_embd_head,
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
ggml_set_name(K, "K");
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd_head)
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
ggml_set_name(KQ_scaled, "KQ_scaled");
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
n_ctx, n_embd_head, n_head_kv,
ggml_element_size(kv_self.v)*n_ctx,
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
ggml_set_name(V, "V");
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
ggml_set_name(KQV, "KQV");
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
ggml_set_name(KQV_merged, "KQV_merged");
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens));
ggml_set_name(cur, "KQV_merged_contiguous");
}
// Projection
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
// Add the input
cur = ggml_add(ctx0, cur, inpL);
struct ggml_tensor * inpFF = cur;
// FF
{
// Norm
{
cur = ggml_norm(ctx0, inpFF, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
}
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
// GELU activation
cur = ggml_gelu(ctx0, cur);
// Projection
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
}
inpL = ggml_add(ctx0, cur, inpFF);
}
// Output Norm
{
cur = ggml_norm(ctx0, inpL, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
}
ggml_set_name(cur, "result_norm");
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
return gf;
}
static struct ggml_cgraph * llama_build_graph( static struct ggml_cgraph * llama_build_graph(
llama_context & lctx, llama_context & lctx,
llama_batch & batch) { llama_batch & batch) {
@ -3465,6 +3837,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm_build_falcon(lctx, batch); result = llm_build_falcon(lctx, batch);
} break; } break;
case LLM_ARCH_STARCODER:
{
result = llm_build_starcoder(lctx, batch);
} break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
}; };
@ -4070,7 +4446,7 @@ struct llama_grammar_candidate {
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8( static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const char * src, const char * src,
llama_partial_utf8 partial_start) { llama_partial_utf8 partial_start) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@ -5668,7 +6044,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} }
// TODO: after the GGUF PR, this likely won't work and needs to be updated // TODO: after the GGUF PR, this likely won't work and needs to be updated
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { static int llama_apply_lora_from_file_internal(
const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
) {
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
const int64_t t_start_lora_us = ggml_time_us(); const int64_t t_start_lora_us = ggml_time_us();
@ -6220,7 +6598,7 @@ struct llama_context * llama_new_context_with_model(
return ctx; return ctx;
} }
struct llama_context * llama_init_from_file( static struct llama_context * llama_init_from_file(
const char * path_model, const char * path_model,
struct llama_context_params params) { struct llama_context_params params) {
struct llama_model * model = llama_load_model_from_file(path_model, params); struct llama_model * model = llama_load_model_from_file(path_model, params);
@ -6429,7 +6807,7 @@ struct llama_data_file_context : llama_data_context {
* llama_copy_state_data(ctx, &data_ctx); * llama_copy_state_data(ctx, &data_ctx);
* *
*/ */
void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
// TODO: does not support multi-sequence states // TODO: does not support multi-sequence states
{ {
const auto & kv_self = ctx->kv_self; const auto & kv_self = ctx->kv_self;
@ -6835,19 +7213,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
int llama_tokenize( int llama_tokenize(
struct llama_context * ctx, struct llama_context * ctx,
const char * text, const char * text,
int text_len,
llama_token * tokens, llama_token * tokens,
int n_max_tokens, int n_max_tokens,
bool add_bos) { bool add_bos) {
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
} }
int llama_tokenize_with_model( int llama_tokenize_with_model(
const struct llama_model * model, const struct llama_model * model,
const char * text, const char * text,
int text_len,
llama_token * tokens, llama_token * tokens,
int n_max_tokens, int n_max_tokens,
bool add_bos) { bool add_bos) {
auto res = llama_tokenize_internal(model->vocab, text, add_bos); auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
if (n_max_tokens < (int) res.size()) { if (n_max_tokens < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@ -6989,7 +7369,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
} }
// For internal test use // For internal test use
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) { const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
) {
return ctx->model.tensors_by_name; return ctx->model.tensors_by_name;
} }

View File

@ -388,6 +388,7 @@ extern "C" {
LLAMA_API int llama_tokenize( LLAMA_API int llama_tokenize(
struct llama_context * ctx, struct llama_context * ctx,
const char * text, const char * text,
int text_len,
llama_token * tokens, llama_token * tokens,
int n_max_tokens, int n_max_tokens,
bool add_bos); bool add_bos);
@ -395,6 +396,7 @@ extern "C" {
LLAMA_API int llama_tokenize_with_model( LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model, const struct llama_model * model,
const char * text, const char * text,
int text_len,
llama_token * tokens, llama_token * tokens,
int n_max_tokens, int n_max_tokens,
bool add_bos); bool add_bos);
@ -554,7 +556,9 @@ extern "C" {
struct ggml_tensor; struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx); const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
#endif // LLAMA_API_INTERNAL #endif // LLAMA_API_INTERNAL

View File

@ -16,7 +16,7 @@
constexpr int kVecSize = 1 << 18; constexpr int kVecSize = 1 << 18;
float drawFromGaussianPdf(std::mt19937& rndm) { static float drawFromGaussianPdf(std::mt19937& rndm) {
constexpr double kScale = 1./(1. + std::mt19937::max()); constexpr double kScale = 1./(1. + std::mt19937::max());
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale; constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
static float lastX; static float lastX;
@ -28,7 +28,8 @@ float drawFromGaussianPdf(std::mt19937& rndm) {
haveX = true; haveX = true;
return r*cos(phi); return r*cos(phi);
} }
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm); for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
} }

View File

@ -2,6 +2,8 @@ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
set(BUILD_NUMBER 0) set(BUILD_NUMBER 0)
set(BUILD_COMMIT "unknown") set(BUILD_COMMIT "unknown")
set(BUILD_COMPILER "unknown")
set(BUILD_TARGET "unknown")
# Look for git # Look for git
find_package(Git) find_package(Git)
@ -41,11 +43,45 @@ if(Git_FOUND)
endif() endif()
endif() endif()
if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
set(BUILD_COMMIT ${HEAD})
set(BUILD_NUMBER ${COUNT})
endif()
execute_process(
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES
)
if (RES EQUAL 0)
set(BUILD_COMPILER ${OUT})
endif()
execute_process(
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES
)
if (RES EQUAL 0)
set(BUILD_TARGET ${OUT})
endif()
# Only write the header if it's changed to prevent unnecessary recompilation # Only write the header if it's changed to prevent unnecessary recompilation
if(EXISTS ${HEADER_FILE}) if(EXISTS ${HEADER_FILE})
file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"") file(READ ${HEADER_FILE} CONTENTS)
list(GET CONTENTS 0 EXISTING) string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"") set(OLD_COMMIT ${CMAKE_MATCH_1})
string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
set(OLD_COMPILER ${CMAKE_MATCH_1})
string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
set(OLD_TARGET ${CMAKE_MATCH_1})
if (
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
NOT OLD_TARGET STREQUAL BUILD_TARGET
)
configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
endif() endif()
else() else()

View File

@ -3,5 +3,7 @@
#define BUILD_NUMBER @BUILD_NUMBER@ #define BUILD_NUMBER @BUILD_NUMBER@
#define BUILD_COMMIT "@BUILD_COMMIT@" #define BUILD_COMMIT "@BUILD_COMMIT@"
#define BUILD_COMPILER "@BUILD_COMPILER@"
#define BUILD_TARGET "@BUILD_TARGET@"
#endif // BUILD_INFO_H #endif // BUILD_INFO_H

View File

@ -1,23 +1,35 @@
#!/bin/sh #!/bin/sh
BUILD_NUMBER="0" CC=$1
BUILD_COMMIT="unknown"
REV_LIST=$(git rev-list --count HEAD) build_number="0"
if [ $? -eq 0 ]; then build_commit="unknown"
BUILD_NUMBER=$REV_LIST build_compiler="unknown"
build_target="unknown"
if out=$(git rev-list --count HEAD); then
# git is broken on WSL so we need to strip extra newlines
build_number=$(printf '%s' "$out" | tr -d '\n')
fi fi
REV_PARSE=$(git rev-parse --short HEAD) if out=$(git rev-parse --short HEAD); then
if [ $? -eq 0 ]; then build_commit=$(printf '%s' "$out" | tr -d '\n')
BUILD_COMMIT=$REV_PARSE fi
if out=$($CC --version | head -1); then
build_compiler=$out
fi
if out=$($CC -dumpmachine); then
build_target=$out
fi fi
echo "#ifndef BUILD_INFO_H" echo "#ifndef BUILD_INFO_H"
echo "#define BUILD_INFO_H" echo "#define BUILD_INFO_H"
echo "" echo
echo "#define BUILD_NUMBER $BUILD_NUMBER" | tr -d '\n' echo "#define BUILD_NUMBER $build_number"
echo "" echo "#define BUILD_COMMIT \"$build_commit\""
echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" | tr -d '\n' echo "#define BUILD_COMPILER \"$build_compiler\""
echo "" echo "#define BUILD_TARGET \"$build_target\""
echo
echo "#endif // BUILD_INFO_H" echo "#endif // BUILD_INFO_H"

View File

@ -36,15 +36,15 @@
#define GGML_PRINT(...) printf(__VA_ARGS__) #define GGML_PRINT(...) printf(__VA_ARGS__)
float frand(void) { static float frand(void) {
return (float)rand()/(float)RAND_MAX; return (float)rand()/(float)RAND_MAX;
} }
int irand(int n) { static int irand(int n) {
return rand()%n; return rand()%n;
} }
void get_random_dims(int64_t * dims, int ndims) { static void get_random_dims(int64_t * dims, int ndims) {
dims[0] = dims[1] = dims[2] = dims[3] = 1; dims[0] = dims[1] = dims[2] = dims[3] = 1;
for (int i = 0; i < ndims; i++) { for (int i = 0; i < ndims; i++) {
@ -52,7 +52,7 @@ void get_random_dims(int64_t * dims, int ndims) {
} }
} }
void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
dims[0] = dims[1] = dims[2] = dims[3] = 1; dims[0] = dims[1] = dims[2] = dims[3] = 1;
for (int i = 0; i < ndims; i++) { for (int i = 0; i < ndims; i++) {
@ -61,12 +61,9 @@ void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
} }
struct ggml_tensor * get_random_tensor( static struct ggml_tensor * get_random_tensor(
struct ggml_context * ctx0, struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
int ndims, ) {
int64_t ne[],
float fmin,
float fmax) {
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
switch (ndims) { switch (ndims) {
@ -109,11 +106,11 @@ struct ggml_tensor * get_random_tensor(
return result; return result;
} }
float get_element(const struct ggml_tensor * t, int idx) { static float get_element(const struct ggml_tensor * t, int idx) {
return ((float *)t->data)[idx]; return ((float *)t->data)[idx];
} }
void set_element(struct ggml_tensor * t, int idx, float value) { static void set_element(struct ggml_tensor * t, int idx, float value) {
((float *)t->data)[idx] = value; ((float *)t->data)[idx] = value;
} }

View File

@ -13,24 +13,24 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
const float MAX_DOT_PRODUCT_ERROR = 0.02f; constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
const char* RESULT_STR[] = {"ok", "FAILED"}; static const char* RESULT_STR[] = {"ok", "FAILED"};
// Generate synthetic data // Generate synthetic data
void generate_data(float offset, size_t n, float * dst) { static void generate_data(float offset, size_t n, float * dst) {
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
dst[i] = 0.1 + 2*cosf(i + offset); dst[i] = 0.1 + 2*cosf(i + offset);
} }
} }
// Calculate RMSE between two float arrays // Calculate RMSE between two float arrays
float array_rmse(const float * a1, const float * a2, size_t n) { static float array_rmse(const float * a1, const float * a2, size_t n) {
double sum = 0; double sum = 0;
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
double diff = a1[i] - a2[i]; double diff = a1[i] - a2[i];
@ -40,7 +40,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
} }
// Total quantization error on test data // Total quantization error on test data
float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size); std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size); std::vector<float> tmp_out(test_size);
@ -50,7 +50,7 @@ float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, cons
} }
// Total quantization error on test data // Total quantization error on test data
float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size); std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size); std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size); std::vector<float> tmp_out_ref(test_size);
@ -64,7 +64,7 @@ float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size,
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
} }
float dot_product(const float * a1, const float * a2, size_t test_size) { static float dot_product(const float * a1, const float * a2, size_t test_size) {
double sum = 0; double sum = 0;
for (size_t i = 0; i < test_size; i++) { for (size_t i = 0; i < test_size; i++) {
sum += a1[i] * a2[i]; sum += a1[i] * a2[i];
@ -73,7 +73,9 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
} }
// Total dot product error // Total dot product error
float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) { static float dot_product_error(
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
) {
std::vector<uint8_t> tmp_q1(2*test_size); std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size); std::vector<uint8_t> tmp_q2(2*test_size);

View File

@ -61,22 +61,22 @@ inline int64_t cpu_cycles() {
// Generate synthetic data // Generate synthetic data
void generate_data(float offset, size_t n, float * dst) { static void generate_data(float offset, size_t n, float * dst) {
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
dst[i] = 0.1 + 2*cosf(i + offset); dst[i] = 0.1 + 2*cosf(i + offset);
} }
} }
float gigabytes_per_second(size_t bytes, int64_t usecs) { static float gigabytes_per_second(size_t bytes, int64_t usecs) {
return bytes / (float) usecs * 1000000 / (1024*1024*1024); return bytes / (float) usecs * 1000000 / (1024*1024*1024);
} }
void * align_with_offset(void * ptr, int offset) { static void * align_with_offset(void * ptr, int offset) {
size_t dummy_size = MAX_ALIGNMENT * 4; size_t dummy_size = MAX_ALIGNMENT * 4;
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
} }
void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) { static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
int64_t min_time_us = INT64_MAX; int64_t min_time_us = INT64_MAX;
int64_t total_time_us = 0; int64_t total_time_us = 0;
int64_t min_time_cycles = INT64_MAX; int64_t min_time_cycles = INT64_MAX;
@ -108,7 +108,7 @@ void benchmark_function(size_t size, size_t q_size, int64_t iterations, const st
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us)); printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
} }
void usage(char * argv[]) { static void usage(char * argv[]) {
printf("Benchmark quantization specific functions on synthetic data\n"); printf("Benchmark quantization specific functions on synthetic data\n");
printf("\n"); printf("\n");
printf("usage: %s [options]\n", argv[0]); printf("usage: %s [options]\n", argv[0]);

View File

@ -12,7 +12,8 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
void dump(const llama_token_data_array * candidates) {
static void dump(const llama_token_data_array * candidates) {
for (size_t i = 0; i < candidates->size; i++) { for (size_t i = 0; i < candidates->size; i++) {
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
} }
@ -21,9 +22,7 @@ void dump(const llama_token_data_array * candidates) {
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
void test_top_k(const std::vector<float> & probs, static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
const std::vector<float> & expected_probs,
int k) {
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -45,10 +44,7 @@ void test_top_k(const std::vector<float> & probs,
} }
void test_top_p(const std::vector<float> & probs, static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const std::vector<float> & expected_probs,
float p) {
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -70,9 +66,7 @@ void test_top_p(const std::vector<float> & probs,
} }
void test_tfs(const std::vector<float> & probs, static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
const std::vector<float> & expected_probs,
float z) {
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -93,9 +87,7 @@ void test_tfs(const std::vector<float> & probs,
} }
void test_typical(const std::vector<float> & probs, static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const std::vector<float> & expected_probs,
float p) {
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -116,11 +108,10 @@ void test_typical(const std::vector<float> & probs,
} }
void test_repetition_penalty( static void test_repetition_penalty(
const std::vector<float> & probs, const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
const std::vector<llama_token> & last_tokens, const std::vector<float> & expected_probs, float penalty
const std::vector<float> & expected_probs, ) {
float penalty) {
assert(probs.size() == expected_probs.size()); assert(probs.size() == expected_probs.size());
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
@ -145,11 +136,10 @@ void test_repetition_penalty(
} }
void test_frequency_presence_penalty( static void test_frequency_presence_penalty(
const std::vector<float> & probs, const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
const std::vector<llama_token> & last_tokens, const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
const std::vector<float> & expected_probs, ) {
float alpha_frequency, float alpha_presence) {
assert(probs.size() == expected_probs.size()); assert(probs.size() == expected_probs.size());
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();

View File

@ -36,6 +36,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
{ " Hello" , { 1678, 15043, }, }, { " Hello" , { 1678, 15043, }, },
{ " Hello" , { 268, 15043, }, }, { " Hello" , { 268, 15043, }, },
{ " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, }, { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, },
{ " (" , { 29871, 313, }, },
}; };
return _k_tests; return _k_tests;

View File

@ -13,7 +13,7 @@
typedef int codepoint; typedef int codepoint;
std::string codepoint_to_utf8(codepoint cp) { static std::string codepoint_to_utf8(codepoint cp) {
std::string result; std::string result;
if (0x00 <= cp && cp <= 0x7f) { if (0x00 <= cp && cp <= 0x7f) {
result.push_back(cp); result.push_back(cp);
@ -87,35 +87,31 @@ int main(int argc, char **argv) {
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false); std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens); std::string check = llama_detokenize_spm(ctx, tokens);
if (check != str) { if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%d) but tokenization of this detokenizes to >%s<(%d)\n", fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
__func__, i, str.c_str(), (int) str.length(), check.c_str(), (int) check.length()); __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
if (i != 3) {
return 2; return 2;
} }
} }
}
for (codepoint cp = 0x0000; cp < 0xffff; ++cp) { for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
if (cp < 0xd800 || cp > 0xdfff) { if (cp < 0xd800 || cp > 0xdfff) {
std::string str = codepoint_to_utf8(cp); std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false); std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens); std::string check = llama_detokenize_spm(ctx, tokens);
if (str != check) { if (cp != 9601 && str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%d) instead of >%s<(%d)\n", fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), (int) check.length(), str.c_str(), (int) str.length()); __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
if (cp != 0 && cp != 9601) {
return 3; return 3;
} }
} }
} }
}
for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) { for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
std::string str = codepoint_to_utf8(cp); std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false); std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens); std::string check = llama_detokenize_spm(ctx, tokens);
if (str != check) { if (str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%d) instead of >%s<(%d)\n", fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), (int) check.length(), str.c_str(), (int) str.length()); __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4; return 4;
} }
} }