diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 641dbccd8..4b6071f5a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -265,22 +265,24 @@ jobs: matrix: include: - build: 'noavx' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF' + defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2' - defines: '-DLLAMA_BUILD_SERVER=ON' + defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'avx' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF' + defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512' defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'clblast' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' + defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' - build: 'openblas' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' steps: - name: Clone id: checkout uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Download OpenCL SDK id: get_opencl @@ -397,11 +399,14 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v3 + with: + fetch-depth: 0 - uses: Jimver/cuda-toolkit@v0.2.11 id: cuda-toolkit with: cuda: ${{ matrix.cuda }} + method: 'network' sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' - name: Build @@ -409,7 +414,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON + cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON cmake --build . --config Release - name: Determine tag name @@ -485,6 +490,8 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Determine tag name id: tag diff --git a/CMakeLists.txt b/CMakeLists.txt index 3283f7b65..c0b93564a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,6 +80,8 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") +set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING + "llama: max. 
batch size for using peer access") option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) @@ -304,6 +306,7 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_CUDA_F16) endif() add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) if (LLAMA_STATIC) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) @@ -427,6 +430,7 @@ if (LLAMA_ALL_WARNINGS) -Wextra -Wpedantic -Wcast-qual + -Wmissing-declarations -Wno-unused-function -Wno-multichar ) diff --git a/Makefile b/Makefile index 7ab1b7a09..dc8ae3807 100644 --- a/Makefile +++ b/Makefile @@ -95,16 +95,19 @@ CXXV := $(shell $(CXX) --version | head -n 1) # # keep standard at C11 and C++11 +MK_CPPFLAGS = -I. -Icommon +MK_CFLAGS = -std=c11 -fPIC +MK_CXXFLAGS = -std=c++11 -fPIC + # -Ofast tends to produce faster code, but may not be available for some compilers. ifdef LLAMA_FAST -OPT = -Ofast +MK_CFLAGS += -Ofast +MK_HOST_CXXFLAGS += -Ofast +MK_CUDA_CXXFLAGS += -O3 else -OPT = -O3 +MK_CFLAGS += -O3 +MK_CXXFLAGS += -O3 endif -MK_CPPFLAGS = -I. -Icommon -MK_CFLAGS = $(OPT) -std=c11 -fPIC -MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC -MK_LDFLAGS = # clock_gettime came in POSIX.1b (1993) # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional @@ -172,9 +175,16 @@ endif # LLAMA_DISABLE_LOGS # warnings MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function -MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar +MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar -ifeq '' '$(findstring clang,$(shell $(CXX) --version))' +# TODO(cebtenzzre): remove this once PR #2632 gets merged +TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations + +ifneq '' '$(findstring clang,$(shell $(CXX) --version))' + # clang++ only + MK_CXXFLAGS += -Wmissing-prototypes + TTFS_CXXFLAGS += -Wno-missing-prototypes +else # g++ only MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds endif @@ -225,7 +235,7 @@ ifndef RISCV ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: MK_CFLAGS += -march=native -mtune=native - MK_CXXFLAGS += -march=native -mtune=native + MK_HOST_CXXFLAGS += -march=native -mtune=native # Usage AVX-only #MK_CFLAGS += -mfma -mf16c -mavx @@ -358,6 +368,11 @@ ifdef LLAMA_CUDA_KQUANTS_ITER else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif +ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE + NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE) +else + NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 +endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE #ifdef LLAMA_CUDA_CUBLAS # NVCCFLAGS += -DGGML_CUDA_CUBLAS #endif # LLAMA_CUDA_CUBLAS @@ -365,7 +380,7 @@ ifdef LLAMA_CUDA_CCBIN NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@ + $(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS ifdef LLAMA_CLBLAST @@ -433,23 +448,30 @@ k_quants.o: k_quants.c k_quants.h endif # LLAMA_NO_K_QUANTS # combine build flags with cmdline overrides -override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) -override CXXFLAGS := 
$(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) -override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) +override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) +override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) +override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS) +override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS) +override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) + +# save CXXFLAGS before we add host-only options +NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)" +override CXXFLAGS += $(HOST_CXXFLAGS) # # Print build information # $(info I llama.cpp build info: ) -$(info I UNAME_S: $(UNAME_S)) -$(info I UNAME_P: $(UNAME_P)) -$(info I UNAME_M: $(UNAME_M)) -$(info I CFLAGS: $(CFLAGS)) -$(info I CXXFLAGS: $(CXXFLAGS)) -$(info I LDFLAGS: $(LDFLAGS)) -$(info I CC: $(CCV)) -$(info I CXX: $(CXXV)) +$(info I UNAME_S: $(UNAME_S)) +$(info I UNAME_P: $(UNAME_P)) +$(info I UNAME_M: $(UNAME_M)) +$(info I CFLAGS: $(CFLAGS)) +$(info I CXXFLAGS: $(CXXFLAGS)) +$(info I NVCCFLAGS: $(NVCCFLAGS)) +$(info I LDFLAGS: $(LDFLAGS)) +$(info I CC: $(CCV)) +$(info I CXX: $(CXXV)) $(info ) # @@ -492,22 +514,22 @@ main: examples/main/main.cpp build-info.h ggml. @echo '==== Run ./main -h for help. ====' @echo -simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) +simple: examples/simple/simple.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) +embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) +save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) @@ -524,7 +546,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -547,7 +569,7 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS) endif build-info.h: $(wildcard .git/index) 
scripts/build-info.sh - @sh scripts/build-info.sh > $@.tmp + @sh scripts/build-info.sh $(CC) > $@.tmp @if ! cmp -s $@.tmp $@; then \ mv $@.tmp $@; \ else \ @@ -560,7 +582,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh tests: $(TEST_TARGETS) -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ./$@ diff --git a/README.md b/README.md index b3845afd7..d8fd8bc44 100644 --- a/README.md +++ b/README.md @@ -391,13 +391,14 @@ Building the program with BLAS support may lead to some performance improvements - | Option | Legal values | Default | Description | - |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | - | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | Option | Legal values | Default | Description | + |--------------------------------|------------------------|---------|-------------| + | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | + | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. 
| + | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | - #### hipBLAS diff --git a/common/common.cpp b/common/common.cpp index 8fbff1da7..b8d306ae2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -78,7 +78,7 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; } -void process_escapes(std::string& input) { +static void process_escapes(std::string& input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -798,10 +798,10 @@ std::vector llama_tokenize( // upper limit for the number of tokens int n_tokens = text.length() + add_bos; std::vector result(n_tokens); - n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); + n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); + int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); diff --git a/common/common.h b/common/common.h index 504e5944e..2cc11966b 100644 --- a/common/common.h +++ b/common/common.h @@ -3,6 +3,7 @@ #pragma once #include "llama.h" +#include "build-info.h" #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" @@ -23,6 +24,11 @@ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) +#define print_build_info() do { \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \ +} while(0) + // // CLI argument parsing // diff --git a/common/console.cpp b/common/console.cpp index 23545e5be..f65cbc6ed 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -158,7 +158,7 @@ namespace console { } } - char32_t getchar32() { + static char32_t getchar32() { #if defined(_WIN32) HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); wchar_t high_surrogate = 0; @@ -212,7 +212,7 @@ namespace console { #endif } - void pop_cursor() { + static void pop_cursor() { #if defined(_WIN32) if (hConsole != NULL) { CONSOLE_SCREEN_BUFFER_INFO bufferInfo; @@ -233,7 +233,7 @@ namespace console { putc('\b', out); } - int estimateWidth(char32_t codepoint) { + static int estimateWidth(char32_t codepoint) { #if defined(_WIN32) (void)codepoint; return 1; @@ -242,7 +242,7 @@ namespace console { #endif } - int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) { + static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) { #if defined(_WIN32) CONSOLE_SCREEN_BUFFER_INFO bufferInfo; if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) { @@ -303,7 +303,7 @@ namespace console { #endif } - void replace_last(char ch) { + static void replace_last(char ch) { #if defined(_WIN32) pop_cursor(); put_codepoint(&ch, 1, 1); @@ -312,7 +312,7 @@ namespace console { #endif } - void append_utf8(char32_t ch, std::string & out) { + static void append_utf8(char32_t ch, std::string & out) { if (ch <= 0x7F) { out.push_back(static_cast(ch)); } else if (ch <= 0x7FF) { @@ -333,7 
+333,7 @@ namespace console { } // Helper function to remove the last UTF-8 character from a string - void pop_back_utf8_char(std::string & line) { + static void pop_back_utf8_char(std::string & line) { if (line.empty()) { return; } @@ -349,7 +349,7 @@ namespace console { line.erase(pos); } - bool readline_advanced(std::string & line, bool multiline_input) { + static bool readline_advanced(std::string & line, bool multiline_input) { if (out != stdout) { fflush(stdout); } @@ -452,7 +452,7 @@ namespace console { return has_more; } - bool readline_simple(std::string & line, bool multiline_input) { + static bool readline_simple(std::string & line, bool multiline_input) { #if defined(_WIN32) std::wstring wline; if (!std::getline(std::wcin, wline)) { diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index 177d1e3a8..5a545a807 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -9,7 +9,7 @@ namespace grammar_parser { // NOTE: assumes valid utf8 (but checks for overrun) // copied from llama.cpp - std::pair decode_utf8(const char * src) { + static std::pair decode_utf8(const char * src) { static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; uint8_t first_byte = static_cast(*src); uint8_t highbits = first_byte >> 4; @@ -24,19 +24,19 @@ namespace grammar_parser { return std::make_pair(value, pos); } - uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { + static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { uint32_t next_id = static_cast(state.symbol_ids.size()); auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); return result.first->second; } - uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { + static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { uint32_t next_id = static_cast(state.symbol_ids.size()); state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; return next_id; } - void add_rule( + static void add_rule( parse_state & state, uint32_t rule_id, const std::vector & rule) { @@ -46,11 +46,11 @@ namespace grammar_parser { state.rules[rule_id] = rule; } - bool is_word_char(char c) { + static bool is_word_char(char c) { return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9'); } - std::pair parse_hex(const char * src, int size) { + static std::pair parse_hex(const char * src, int size) { const char * pos = src; const char * end = src + size; uint32_t value = 0; @@ -73,7 +73,7 @@ namespace grammar_parser { return std::make_pair(value, pos); } - const char * parse_space(const char * src, bool newline_ok) { + static const char * parse_space(const char * src, bool newline_ok) { const char * pos = src; while (*pos == ' ' || *pos == '\t' || *pos == '#' || (newline_ok && (*pos == '\r' || *pos == '\n'))) { @@ -88,7 +88,7 @@ namespace grammar_parser { return pos; } - const char * parse_name(const char * src) { + static const char * parse_name(const char * src) { const char * pos = src; while (is_word_char(*pos)) { pos++; @@ -99,7 +99,7 @@ namespace grammar_parser { return pos; } - std::pair parse_char(const char * src) { + static std::pair parse_char(const char * src) { if (*src == '\\') { switch (src[1]) { case 'x': return parse_hex(src + 2, 2); @@ -129,7 +129,7 @@ namespace grammar_parser { uint32_t rule_id, bool is_nested); - const char * parse_sequence( + static const char * parse_sequence( parse_state & state, const char * src, 
const std::string & rule_name, @@ -247,7 +247,7 @@ namespace grammar_parser { return pos; } - const char * parse_rule(parse_state & state, const char * src) { + static const char * parse_rule(parse_state & state, const char * src) { const char * name_end = parse_name(src); const char * pos = parse_space(name_end, false); size_t name_len = name_end - src; @@ -285,7 +285,7 @@ namespace grammar_parser { } } - void print_grammar_char(FILE * file, uint32_t c) { + static void print_grammar_char(FILE * file, uint32_t c) { if (0x20 <= c && c <= 0x7f) { fprintf(file, "%c", static_cast(c)); } else { @@ -294,7 +294,7 @@ namespace grammar_parser { } } - bool is_char_element(llama_grammar_element elem) { + static bool is_char_element(llama_grammar_element elem) { switch (elem.type) { case LLAMA_GRETYPE_CHAR: return true; case LLAMA_GRETYPE_CHAR_NOT: return true; @@ -304,7 +304,7 @@ namespace grammar_parser { } } - void print_rule_binary(FILE * file, const std::vector & rule) { + static void print_rule_binary(FILE * file, const std::vector & rule) { for (auto elem : rule) { switch (elem.type) { case LLAMA_GRETYPE_END: fprintf(file, "END"); break; @@ -334,7 +334,7 @@ namespace grammar_parser { fprintf(file, "\n"); } - void print_rule( + static void print_rule( FILE * file, uint32_t rule_id, const std::vector & rule, diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py new file mode 100755 index 000000000..331e84e98 --- /dev/null +++ b/convert-starcoder-hf-to-gguf.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +# HF starcoder --> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def bytes_to_unicode(): + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + return dict(zip(bs, (chr(n) for n in cs))) + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "GPTBigCodeForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.STARCODER +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layer"] + +gguf_writer.add_name("StarCoder") +gguf_writer.add_context_length(hparams["n_positions"]) +gguf_writer.add_embedding_length(hparams["n_embd"]) +gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(hparams["n_head"]) +gguf_writer.add_head_count_kv(1) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +tokenizer_json_file = dir_model / 'tokenizer.json' +if not tokenizer_json_file.is_file(): + print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr) + sys.exit(1) + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +with open(tokenizer_json_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + +print("gguf: get gpt2 tokenizer vocab") + +# The number of tokens in tokenizer.json can differ from the expected vocab size. 
+# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"]) + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} +byte_encoder = bytes_to_unicode() +byte_decoder = {v: k for k, v in byte_encoder.items()} + +for i in range(vocab_size): + if i in reverse_vocab: + try: + text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) + except KeyError: + text = bytearray() + for c in reverse_vocab[i]: + if ord(c) < 256: # single byte character + text.append(byte_decoder[ord(c)]) + else: # multibyte special token character + text.extend(c.encode('utf-8')) + else: + print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.") + pad_token = f"[PAD{i}]".encode("utf8") + text = bytearray(pad_token) + + tokens.append(text) + scores.append(0.0) # dummy + toktypes.append(gguf.TokenType.NORMAL) # dummy + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +# params for qkv transform +n_head = hparams["n_head"] +n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1 + +head_dim = hparams["n_embd"] // n_head + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(dir_model / part_name, map_location="cpu") + + for name in model_part.keys(): + data = model_part[name] + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Cannot map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why can't we use these float16 as-is?
There should be no reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index b5916f230..b02a80863 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -9,12 +9,12 @@ #endif #ifdef LLAMA_DEFAULT_RMS_EPS -static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; +constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; #else -static const float rms_norm_eps = 5e-6f; +constexpr float rms_norm_eps = 5e-6f; #endif -float frand() { +static float frand() { return (float)rand()/(float)RAND_MAX; } @@ -25,19 +25,21 @@ struct random_normal_distribution { float max; }; -void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { +static void init_random_normal_distribution( + struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max +) { rnd->gen = std::mt19937(seed); rnd->nd = std::normal_distribution<float>{mean, std}; rnd->min = min; rnd->max = max; } -float frand_normal(struct random_normal_distribution * rnd) { +static float frand_normal(struct random_normal_distribution * rnd) { const float r = rnd->nd(rnd->gen); return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ?
(rnd->max) : r); } -void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); if (plan.work_size > 0) { @@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, ggml_graph_compute(graph, &plan); } -struct ggml_tensor * randomize_tensor( - struct ggml_tensor * tensor, - int ndims, - const int64_t ne[], - float fmin, - float fmax) { - +static struct ggml_tensor * randomize_tensor( + struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax +) { switch (ndims) { case 1: for (int i0 = 0; i0 < ne[0]; i0++) { @@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor( return tensor; } -struct ggml_tensor * randomize_tensor_normal( - struct ggml_tensor * tensor, - int ndims, - const int64_t ne[], - struct random_normal_distribution * rnd) { +static struct ggml_tensor * randomize_tensor_normal( + struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd +) { float scale = 1.0; // xavier switch (ndims) { case 1: @@ -159,7 +155,7 @@ struct llama_hparams { } }; -uint32_t get_n_ff(const struct llama_hparams* hparams) { +static uint32_t get_n_ff(const struct llama_hparams* hparams) { const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; return n_ff; } @@ -260,7 +256,7 @@ struct llama_model_lora { std::vector layers; }; -void init_model(struct llama_model * model) { +static void init_model(struct llama_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -297,7 +293,7 @@ void init_model(struct llama_model * model) { } -void init_model_lora(struct llama_model_lora * model) { +static void init_model_lora(struct llama_model_lora * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) { } } -void set_param_model(struct llama_model * model) { +static void set_param_model(struct llama_model * model) { const auto& hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) { } } -void set_param_model_lora(struct llama_model_lora * model) { +static void set_param_model_lora(struct llama_model_lora * model) { const auto& hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) { } } -void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { +static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { const auto & hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std } -void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) { +static void randomize_model_lora( + struct llama_model_lora * model, int seed, float mean, float std, float min, float max +) { const auto & hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, } } -bool init_kv_cache(struct llama_kv_cache* cache, struct 
llama_model * model, int n_batch) { +static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { const auto & hparams = model->hparams; const uint32_t n_ctx = hparams.n_ctx; @@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int return true; } -bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { +static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { const auto & hparams = model->hparams; const uint32_t n_ctx = hparams.n_ctx; @@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * return true; } -struct ggml_tensor * forward( - struct llama_model * model, - struct llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past) { - +static struct ggml_tensor * forward( + struct llama_model * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past +) { const int N = n_tokens; struct llama_kv_cache& kv_self = *cache; @@ -764,25 +762,25 @@ struct ggml_tensor * forward( return inpL; } -void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { +static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); } -void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { +static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { GGML_ASSERT(tensor->n_dims == 2); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); } -void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { +static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { GGML_ASSERT(tensor->n_dims == 3); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); } -void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { +static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { GGML_ASSERT(tensor->n_dims == 4); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); @@ -790,16 +788,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -struct ggml_tensor * forward_batch( - struct llama_model * model, - struct llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past, - const int n_batch) { - +static struct ggml_tensor * forward_batch( + struct llama_model * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past, + const int n_batch +) { const int N = n_tokens; struct llama_kv_cache& kv_self = *cache; @@ -1090,16 +1088,15 @@ struct ggml_tensor * forward_batch( return inpL; } - -struct ggml_tensor * forward_lora( - struct llama_model_lora * model, - struct llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past) { - +static struct 
ggml_tensor * forward_lora( + struct llama_model_lora * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past +) { const int N = n_tokens; struct llama_kv_cache& kv_self = *cache; @@ -1353,7 +1350,7 @@ struct ggml_tensor * forward_lora( return inpL; } -void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { +static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { assert(logits->n_dims == 2); assert(probs->n_dims == 2); assert(best_samples->n_dims == 1); @@ -1384,7 +1381,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str } } -void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { +static void sample_softmax_batch( + struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, + struct ggml_tensor * best_samples +) { GGML_ASSERT(best_samples->n_dims == 2); GGML_ASSERT(logits->n_dims == 3); GGML_ASSERT(probs->n_dims == 3); @@ -1418,7 +1418,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits } } -void print_row(struct ggml_tensor * probs, int i) { +static void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); printf(" %.2f", p); @@ -1426,7 +1426,7 @@ void print_row(struct ggml_tensor * probs, int i) { printf("\n"); } -void print_matrix(struct ggml_tensor * probs) { +static void print_matrix(struct ggml_tensor * probs) { assert(probs->n_dims == 2); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { @@ -1437,7 +1437,7 @@ void print_matrix(struct ggml_tensor * probs) { } } -void print_token(int token, int n_vocab) { +static void print_token(int token, int n_vocab) { for (int k = 0; k < token; ++k) { printf(" "); } @@ -1448,14 +1448,14 @@ void print_token(int token, int n_vocab) { printf("\n"); } -void print_tokens(struct ggml_tensor * tokens, int n_vocab) { +static void print_tokens(struct ggml_tensor * tokens, int n_vocab) { for (int i=0; ine[0]; ++i) { int token = ggml_get_i32_1d(tokens, i); print_token(token, n_vocab); } } -void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { +static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { int n_tokens = tokens_input->ne[0]; int n_vocab = targets->ne[0]; float randomness = 0.0f; @@ -1476,7 +1476,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru } } -void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { +static void get_example_targets_batch( + struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets +) { GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT( targets->n_dims == 3); int n_tokens = tokens_input->ne[0]; @@ -1499,7 +1501,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct } } -void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { +static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { 
int n_tokens = tokens_input->ne[0]; int n_vocab = targets->ne[0]; for (int i=0; i #include @@ -30,7 +29,8 @@ struct ostream_beam_view { llama_context * ctx; llama_beam_view beam_view; }; -std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) { + +static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) { os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); @@ -46,7 +46,7 @@ struct beam_search_callback_data { // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // For example, eob can be flagged due to maximum token length, stop words, etc. -bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) { +static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); } @@ -56,7 +56,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke // * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. // This is also called when the stop condition is met. // Collect tokens into std::vector response which is pointed to by callback_data. -void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { +static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { auto& callback_data = *static_cast(callback_data_ptr); // Mark beams as EOS as needed. for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 3f3415350..14916d831 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -1,7 +1,8 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index f7215f43b..561309acb 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,5 +1,5 @@ +#include "common.h" #include "ggml.h" -#include "build-info.h" #include #include @@ -99,7 +99,7 @@ int main(int argc, char ** argv) { exit(1); } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); printf("Starting Test\n"); // create the ggml context diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 293b455d0..c291f0adf 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -115,7 +115,7 @@ struct TransformerWeights { } }; -void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { +static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { // we calloc instead of malloc to keep valgrind happy 
w->token_embedding_table = new float[p->vocab_size * p->dim](); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); @@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { } } -int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) { +static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) { if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; @@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar return 0; } -void print_sample_weights(TransformerWeights *w){ +static void print_sample_weights(TransformerWeights *w){ printf("----- Quick print of first of the weight vales of all the variables\n"); printf("%f\n", w->token_embedding_table[0]); printf("%f\n", w->rms_att_weight[0]); @@ -324,7 +324,7 @@ struct train_params { int mem_compute1_gb; }; -void print_params(struct my_llama_hparams * params) { +static void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); @@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) { printf("%s: n_rot: %d\n", __func__, params->n_rot); } -void init_model(struct my_llama_model * model) { +static void init_model(struct my_llama_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) { } } -float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { +static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); return *ptr; } -int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { +static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); return *ptr; } -void print_row(struct ggml_tensor * probs, int i) { +static void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); printf(" %f", p); @@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) { printf("\n"); } -void print_matrix(struct ggml_tensor * probs) { +static void print_matrix(struct ggml_tensor * probs) { assert(probs->n_dims == 2); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { @@ -531,7 +531,7 @@ struct llama_file { } }; -bool is_ggml_file(const char *filename) { +static bool is_ggml_file(const char * filename) { llama_file file(filename, "rb"); if (file.size < 4) { return false; @@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) { return magic == GGUF_MAGIC; } -static std::string llama_escape_whitespaces(const std::string& text) { +static std::string llama_escape_whitespaces(const std::string & text) { std::ostringstream out; for (char c : text) { if (c == ' ') out << "\xe2\x96\x81"; 
@@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) { return out.str(); } -void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { +static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { if (is_ggml_file(filename)) { struct ggml_context * ctx_data = NULL; @@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) } } -void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { +static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { int ct; switch (gg_weights->n_dims){ case 1: @@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar } } -void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { +static void save_as_llama_model( + struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename +) { // convert AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor @@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod gguf_free(ctx); } -struct train_params get_default_train_params() { +static struct train_params get_default_train_params() { struct train_params params; params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_llama2c_output_model = "ak_llama_model.bin"; @@ -835,7 +837,7 @@ struct train_params get_default_train_params() { return params; } -void print_usage(int /*argc*/, char ** argv, const struct train_params * params) { +static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); @@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params) fprintf(stderr, "\n"); } -bool params_parse(int argc, char ** argv, struct train_params * params) { +static bool params_parse(int argc, char ** argv, struct train_params * params) { bool invalid_param = false; bool reqd_param_found = false; std::string arg; @@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { return true; } -std::string basename(const std::string &path) { +static std::string basename(const std::string &path) { size_t pos = path.find_last_of("/\\"); if (pos == std::string::npos) { return path; diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index ef12212ba..fc6e44eb2 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -1,3 +1,4 @@ +#include "common.h" #include "embd-input.h" #include @@ -22,7 +23,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) { return nullptr; } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = uint32_t(time(NULL)); diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h index efb5ba5e2..eff5e3b84 100644 --- a/examples/embd-input/embd-input.h +++ b/examples/embd-input/embd-input.h @@ -3,7 +3,6 @@ #include "common.h" #include "llama.h" -#include "build-info.h" extern "C" { diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 
e4a0a38c8..0788f362c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,6 +1,5 @@ #include "common.h" #include "llama.h" -#include "build-info.h" #include @@ -17,7 +16,7 @@ int main(int argc, char ** argv) { params.embedding = true; - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index a34010f10..9ab63a293 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -13,14 +13,14 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -template +template static std::string to_string(const T & val) { std::stringstream ss; ss << val; return ss.str(); } -bool gguf_ex_write(const std::string & fname) { +static bool gguf_ex_write(const std::string & fname) { struct gguf_context * ctx = gguf_init_empty(); gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12); @@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) { } // just read tensor info -bool gguf_ex_read_0(const std::string & fname) { +static bool gguf_ex_read_0(const std::string & fname) { struct gguf_init_params params = { /*.no_alloc = */ false, /*.ctx = */ NULL, @@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) { } // read and create ggml_context containing the tensors and their data -bool gguf_ex_read_1(const std::string & fname) { +static bool gguf_ex_read_1(const std::string & fname) { struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b9d26bc75..19cbbb2a1 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; -void write_logfile( + +static void write_logfile( const llama_context * ctx, const gpt_params & params, const llama_model * model, const std::vector & input_tokens, const std::string & output, const std::vector & output_tokens @@ -86,7 +87,7 @@ void write_logfile( } #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -void sigint_handler(int signo) { +static void sigint_handler(int signo) { if (signo == SIGINT) { if (!is_interacting) { is_interacting = true; @@ -148,6 +149,7 @@ int main(int argc, char ** argv) { } LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 2b7472dcc..4958cdfb9 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,6 +1,5 @@ #include "common.h" #include "llama.h" -#include "build-info.h" #include #include @@ -28,9 +27,10 @@ struct results_log_softmax { float prob; }; -void write_logfile(const llama_context * ctx, const gpt_params & params, - const llama_model * model, const struct results_perplexity & results) { - +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const struct results_perplexity & results +) { if (params.logdir.empty()) { return; } @@ -76,7 +76,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params, fclose(logfile); } -std::vector softmax(const std::vector& 
logits) { +static std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; for (float v : logits) max_logit = std::max(max_logit, v); @@ -92,7 +92,7 @@ std::vector softmax(const std::vector& logits) { return probs; } -results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { +static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { float max_logit = logits[0]; for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); double sum_exp = 0.0; @@ -100,9 +100,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; } -void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, - double & nll, double & nll2, float * logit_history, float * prob_history) { - +static void process_logits( + int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history +) { std::mutex mutex; int counter = 0; auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { @@ -130,7 +131,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n } -results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { +static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` @@ -260,8 +261,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) return {tokens, std::exp(nll / count), logit_history, prob_history}; } -results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { - +static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { if (params.ppl_stride > 0) { return perplexity_v2(ctx, params); } @@ -400,8 +400,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { return {tokens, ppl, logit_history, prob_history}; } -std::vector hellaswag_evaluate_tokens(llama_context * ctx, const std::vector & tokens, int n_past, int n_batch, - int n_vocab, int n_thread) { +static std::vector hellaswag_evaluate_tokens( + llama_context * ctx, const std::vector& tokens, int n_past, int n_batch, int n_vocab, int n_thread +) { std::vector result; result.reserve(tokens.size() * n_vocab); size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch; @@ -421,7 +422,7 @@ std::vector hellaswag_evaluate_tokens(llama_context * ctx, const std::vec return result; } -void hellaswag_score(llama_context * ctx, const gpt_params & params) { +static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -668,7 +669,7 @@ int main(int argc, char ** argv) { params.n_ctx += params.ppl_stride/2; } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); diff --git 
a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index c5c394058..db182e263 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -2,4 +2,5 @@ set(TARGET quantize-stats) add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6ce03ba7b..9f930dede 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,7 +1,6 @@ -#include "ggml.h" -#include "build-info.h" - #define LLAMA_API_INTERNAL +#include "common.h" +#include "ggml.h" #include "llama.h" #include @@ -34,8 +33,8 @@ struct quantize_stats_params { std::vector include_types; }; -const size_t HISTOGRAM_BUCKETS = 150; -const double HISTOGRAM_RANGE = 0.03; +constexpr size_t HISTOGRAM_BUCKETS = 150; +constexpr double HISTOGRAM_RANGE = 0.03; struct error_stats { size_t num_samples; @@ -44,8 +43,7 @@ struct error_stats { uint64_t error_histogram[HISTOGRAM_BUCKETS]; }; - -void quantize_stats_print_usage(int /*argc*/, char ** argv) { +static void quantize_stats_print_usage(int /*argc*/, char ** argv) { quantize_stats_params params; fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); @@ -71,7 +69,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) { } // Check if a layer is included/excluded by command line -bool layer_included(const quantize_stats_params & params, const std::string & layer) { +static bool layer_included(const quantize_stats_params & params, const std::string & layer) { for (const auto& excluded : params.exclude_layers) { if (std::regex_search(layer, std::regex(excluded))) { return false; @@ -86,7 +84,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la } // Update error statistics given vectors with the before/after result of quantization -void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { +static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { for (int64_t i = 0; i < nelements; i++) { double diff = input[i] - output[i]; stats.total_error += diff * diff; @@ -96,14 +94,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou stats.num_samples += nelements; } -void combine_error_stats(error_stats & into, const error_stats & from) { +static void combine_error_stats(error_stats & into, const error_stats & from) { into.num_samples += from.num_samples; into.total_error += from.total_error; if (from.max_error > into.max_error) into.max_error = from.max_error; for (size_t i=0; inb[3] == tensor->nb[2]*tensor->ne[2]; } -void test_roundtrip_on_chunk( - const ggml_tensor * layer, - int64_t offset, - int64_t chunk_size, - const ggml_type_traits_t & qfns, - bool use_reference, - float * input_scratch, - char * quantized_scratch, - float * output_scratch, - error_stats & stats) { - +static void test_roundtrip_on_chunk( + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference, + float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats +) { if (layer->type == GGML_TYPE_F16) { for (int i = 0; i < 
chunk_size; i++) { input_scratch[i] = ggml_get_f32_1d(layer, i + offset); @@ -174,18 +165,11 @@ void test_roundtrip_on_chunk( // Run quantization function for a single layer and update error stats -void test_roundtrip_on_layer( - std::string & name, - bool print_layer_stats, - const ggml_type_traits_t & qfns, - bool use_reference, - const ggml_tensor * layer, - std::vector & input_scratch, - std::vector & quantized_scratch, - std::vector & output_scratch, - error_stats & total_error, - int max_thread = 0) { - +static void test_roundtrip_on_layer( + std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference, + const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, + std::vector & output_scratch, error_stats & total_error, int max_thread = 0 +) { assert(tensor_is_contiguous(layer)); error_stats layer_error {}; uint64_t nelements = ggml_nelements(layer); @@ -314,7 +298,7 @@ int main(int argc, char ** argv) { return 1; } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); // load the model fprintf(stderr, "Loading model\n"); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 47d0be72e..4a8eed544 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -2,6 +2,7 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1bf182482..acb79e690 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,5 +1,4 @@ -#include "build-info.h" - +#include "common.h" #include "llama.h" #include @@ -40,7 +39,7 @@ static const std::vector QUANT_OPTIONS = { }; -bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { +static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; for (auto ch : ftype_str_in) { @@ -72,7 +71,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std: // usage: // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // -void usage(const char * executable) { +static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); @@ -161,7 +160,7 @@ int main(int argc, char ** argv) { } } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); if (params.nthread > 0) { diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 14e9501ca..eac307904 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,6 +1,5 @@ #include "common.h" #include "llama.h" -#include "build-info.h" #include #include @@ -17,7 +16,7 @@ int main(int argc, char ** argv) { return 1; } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); if (params.n_predict < 0) { params.n_predict = 16; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3f3c64650..1bb8e92c0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1083,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string return res; } -static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector &probs) -{ +static json format_partial_response( + llama_server_context &llama, const std::string &content, const std::vector &probs +) { json res = json{ {"content", content}, {"stop", false}, @@ -1215,7 +1216,7 @@ static void log_server_request(const Request &req, const Response &res) }); } -bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) { +static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) { return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx); } @@ -1225,7 +1226,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens // * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. // This is also called when the stop condition is met. // Collect tokens into std::vector response which is pointed to by callback_data. -void beam_search_callback(void * callback_data, llama_beams_state beams_state) { +static void beam_search_callback(void *callback_data, llama_beams_state beams_state) { auto & llama = *static_cast(callback_data); // Mark beams as EOS as needed. 
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { @@ -1258,7 +1259,8 @@ struct token_translator { std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); } }; -void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) { +static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama) +{ auto & gtps = llama.generated_token_probs; auto translator = token_translator{llama.ctx}; auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); }; diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 0ac9cb03a..7da5ff6f3 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 9f160376a..37eaf3b2c 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,5 +1,3 @@ -#include "build-info.h" - #include "common.h" #include "llama.h" diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index d91566586..025eac2a6 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -973,10 +973,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto buf[size] = '\0'; - int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); + int n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false); if (n_tokens < 0) { out.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); + n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false); } GGML_ASSERT(n_tokens >= 0); out.resize(n_tokens); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index fe7332b2a..08428ea3f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -31,6 +31,9 @@ #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess #define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t @@ -61,7 +64,7 @@ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0) +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #else @@ -190,6 +193,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); } while (0) #endif // CUDART_VERSION >= 11 +#if CUDART_VERSION >= 11100 +#define GGML_CUDA_ASSUME(x) __builtin_assume(x) +#else +#define GGML_CUDA_ASSUME(x) +#endif // CUDART_VERSION >= 11100 + #ifdef GGML_CUDA_F16 typedef half dfloat; // dequantize float typedef half2 dfloat2; @@ -418,6 +427,10 @@ 
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); #endif +#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE +#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128 +#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE + #define MUL_MAT_SRC1_COL_STRIDE 128 #define MAX_STREAMS 8 @@ -2145,10 +2158,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI4_0; const int kqsx = k % QI4_0; @@ -2239,10 +2252,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI4_1; const int kqsx = k % QI4_1; @@ -2331,10 +2344,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI5_0; const int kqsx = k % QI5_0; @@ -2445,10 +2458,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI5_1; const int kqsx = k % QI5_1; @@ -2551,10 +2564,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI8_0; const int kqsx = k % QI8_0; @@ -2642,10 +2655,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ 
x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI2_K; const int kqsx = k % QI2_K; @@ -2763,10 +2776,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI3_K; const int kqsx = k % QI3_K; @@ -2981,10 +2994,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI4_K; // == 0 if QK_K == 256 const int kqsx = k % QI4_K; // == k if QK_K == 256 @@ -3162,10 +3175,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI5_K; // == 0 if QK_K == 256 const int kqsx = k % QI5_K; // == k if QK_K == 256 @@ -3291,10 +3304,10 @@ template static __device__ __forceinlin const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < nwarps); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI6_K; // == 0 if QK_K == 256 const int kqsx = k % QI6_K; // == k if QK_K == 256 @@ -6252,6 +6265,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s } } +void ggml_cuda_set_peer_access(const int n_tokens) { + static bool peer_access_enabled = false; + + const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE; + + if (peer_access_enabled == enable_peer_access) { + return; + } + +#ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + + for (int 
id_other = 0; id_other < g_device_count; ++id_other) { + if (id == id_other) { + continue; + } + if (id != g_main_device && id_other != g_main_device) { + continue; + } + + int can_access_peer; + CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); + if (can_access_peer) { + if (enable_peer_access) { + CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + } else { + CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + } + } + } + } +#endif // NDEBUG + + peer_access_enabled = enable_peer_access; +} + static void ggml_cuda_op_mul_mat( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const bool convert_src1_to_q8_1) { @@ -6276,6 +6326,8 @@ static void ggml_cuda_op_mul_mat( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; + ggml_cuda_set_peer_access(ne11); + GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); @@ -6408,7 +6460,7 @@ static void ggml_cuda_op_mul_mat( // wait for main GPU data if necessary if (split && (id != g_main_device || is != 0)) { - CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0])); + CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0)); } for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { @@ -6530,7 +6582,7 @@ static void ggml_cuda_op_mul_mat( CUDA_CHECK(ggml_cuda_set_device(g_main_device)); for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t is = 0; is < is_max; ++is) { - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is])); + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); } } } @@ -6964,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) return; } if (g_scratch_buffer == nullptr) { + ggml_cuda_set_device(g_main_device); CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); } @@ -7003,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { ggml_cuda_assign_buffers_impl(tensor, false, true, false); } -void ggml_cuda_set_main_device(int main_device) { +void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. 
Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -7017,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) { } } -void ggml_cuda_set_mul_mat_q(bool mul_mat_q) { +void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) { g_mul_mat_q = mul_mat_q; } -void ggml_cuda_set_scratch_size(size_t scratch_size) { +void ggml_cuda_set_scratch_size(const size_t scratch_size) { g_scratch_size = scratch_size; } diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 7f7204ea1..e0e0dbcbb 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -77,13 +77,14 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world" class MODEL_ARCH(IntEnum): - LLAMA : int = auto() - FALCON : int = auto() - BAICHUAN:int = auto() - GPT2 : int = auto() - GPTJ : int = auto() - GPTNEOX: int = auto() - MPT : int = auto() + LLAMA : int = auto() + FALCON : int = auto() + BAICHUAN : int = auto() + GPT2 : int = auto() + GPTJ : int = auto() + GPTNEOX : int = auto() + MPT : int = auto() + STARCODER : int = auto() class MODEL_TENSOR(IntEnum): @@ -107,13 +108,14 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN:"baichuan", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", } MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { @@ -171,6 +173,18 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", }, + MODEL_ARCH.STARCODER: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, MODEL_ARCH.GPT2: { # TODO }, diff --git a/llama.cpp b/llama.cpp index 532937da8..0e1c8755c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,3 +1,4 @@ +#define LLAMA_API_INTERNAL #include "llama.h" #include "ggml.h" @@ -109,7 +110,7 @@ static size_t utf8_len(char src) { return lookup[highbits]; } -void replace_all(std::string & s, const std::string & search, const std::string & replace) { +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { std::string result; for (size_t pos = 0; ; pos += search.length()) { auto new_pos = s.find(search, pos); @@ -161,17 +162,19 @@ enum llm_arch { LLM_ARCH_GPTJ, LLM_ARCH_GPTNEOX, LLM_ARCH_MPT, + LLM_ARCH_STARCODER, LLM_ARCH_UNKNOWN, }; static std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN,"baichuan" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, }; enum 
llm_kv { @@ -377,6 +380,21 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, }, }, + { + LLM_ARCH_STARCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -896,9 +914,11 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_1B, MODEL_3B, MODEL_7B, MODEL_13B, + MODEL_15B, MODEL_30B, MODEL_34B, MODEL_40B, @@ -908,6 +928,7 @@ enum e_model { static const size_t kB = 1024; static const size_t MB = kB*kB; +static const size_t GB = kB*kB*kB; // default hparams (LLaMA 7B) struct llama_hparams { @@ -967,13 +988,22 @@ struct llama_layer { struct ggml_tensor * wo; struct ggml_tensor * wqkv; + // attention bias + struct ggml_tensor * bo; + struct ggml_tensor * bqkv; + // normalization struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; // ff struct ggml_tensor * w1; // ffn_gate struct ggml_tensor * w2; // ffn_down struct ggml_tensor * w3; // ffn_up + + // ff bias + struct ggml_tensor * b2; // ffn_down + struct ggml_tensor * b3; // ffn_up }; struct llama_kv_cell { @@ -1067,6 +1097,7 @@ struct llama_model { llama_vocab vocab; struct ggml_tensor * tok_embeddings; + struct ggml_tensor * pos_embeddings; struct ggml_tensor * output_norm; struct ggml_tensor * output_norm_b; @@ -1334,6 +1365,7 @@ struct llama_model_loader { int n_created = 0; int64_t n_elements = 0; + size_t n_bytes = 0; bool use_mmap = false; @@ -1366,6 +1398,7 @@ struct llama_model_loader { const char * name = gguf_get_tensor_name(ctx_gguf, i); struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); n_elements += ggml_nelements(t); + n_bytes += ggml_nbytes(t); } LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", @@ -1644,7 +1677,7 @@ struct llama_model_loader { // load LLaMA models // -std::string llama_model_ftype_name(enum llama_ftype ftype) { +static std::string llama_model_ftype_name(enum llama_ftype ftype) { if (ftype & LLAMA_FTYPE_GUESSED) { return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; } @@ -1677,9 +1710,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { + case MODEL_1B: return "1B"; case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; case MODEL_13B: return "13B"; + case MODEL_15B: return "15B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; case MODEL_40B: return "40B"; @@ -1797,6 +1832,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_STARCODER: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 36: model.type = e_model::MODEL_3B; break; + case 42: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_15B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; }; @@ -1950,7 +1996,12 @@ static void llm_load_print_meta(llama_model_loader & ml, 
llama_model & model) { LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); - LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9); + LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); + if (ml.n_bytes < GB) { + LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + } else { + LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + } // general kv LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); @@ -2250,6 +2301,85 @@ static void llm_load_tensors( } } } break; + case LLM_ARCH_STARCODER: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + + // output + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + + ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) + + ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3); + } + } + } break; default: throw std::runtime_error("unknown architecture"); }; @@ -3445,6 +3575,248 @@ static struct ggml_cgraph * llm_build_falcon( return gf; } +static struct ggml_cgraph * llm_build_starcoder( + llama_context & lctx, + llama_batch & batch) { + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const float norm_eps = hparams.f_norm_eps; + + const int32_t n_tokens = batch.n_tokens; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * token; + struct ggml_tensor * position; + struct ggml_tensor * inpL; + + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); + } + ggml_set_name(inp_tokens, "inp_tokens"); + + token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + token = ggml_new_tensor_2d(ctx0, 
GGML_TYPE_F32, n_embd, n_tokens); + + ggml_allocr_alloc(lctx.alloc, token); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); + } + } + + { + // Compute position embeddings. + struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_allocr_alloc(lctx.alloc, inp_positions); + if (!ggml_allocr_is_measure(lctx.alloc)) { + for (int i = 0; i < n_tokens; ++i) { + ((int32_t *) inp_positions->data)[i] = batch.pos[i]; + } + } + ggml_set_name(inp_positions, "inp_positions"); + + position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); + } + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1); + ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_ctx; ++i) { + if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) { + data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY; + } + } + } + } + } + + inpL = ggml_add(ctx0, token, position); + ggml_set_name(inpL, "inpL"); + + for (int il = 0; il < n_layer; ++il) { + { + // Norm + cur = ggml_norm(ctx0, inpL, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); + } + + { + // Self Attention + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + + struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); + struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + + struct ggml_tensor * Qcur = tmpq; + struct ggml_tensor * Kcur = tmpk; + + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head)); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_ctx, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_set_name(K, "K"); + + // K * Q + struct ggml_tensor * KQ 
= ggml_mul_mat(ctx0, K, Q); + ggml_set_name(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + ggml_set_name(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_ctx, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + ggml_set_name(KQV, "KQV"); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_set_name(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); + ggml_set_name(cur, "KQV_merged_contiguous"); + } + + // Projection + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); + + // Add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // FF + { + // Norm + { + cur = ggml_norm(ctx0, inpFF, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + } + + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // Projection + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + } + + inpL = ggml_add(ctx0, cur, inpFF); + } + + // Output Norm + { + cur = ggml_norm(ctx0, inpL, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); + } + ggml_set_name(cur, "result_norm"); + + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + + return gf; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, llama_batch & batch) { @@ -3465,6 +3837,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm_build_falcon(lctx, batch); } break; + case LLM_ARCH_STARCODER: + { + result = llm_build_starcoder(lctx, batch); + } break; default: GGML_ASSERT(false); }; @@ -4070,7 +4446,7 @@ struct llama_grammar_candidate { // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. 
-std::pair, llama_partial_utf8> decode_utf8( +static std::pair, llama_partial_utf8> decode_utf8( const char * src, llama_partial_utf8 partial_start) { static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; @@ -5668,7 +6044,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } // TODO: after the GGUF PR, this likely won't work and needs to be updated -int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { +static int llama_apply_lora_from_file_internal( + const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads +) { LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); const int64_t t_start_lora_us = ggml_time_us(); @@ -6220,7 +6598,7 @@ struct llama_context * llama_new_context_with_model( return ctx; } -struct llama_context * llama_init_from_file( +static struct llama_context * llama_init_from_file( const char * path_model, struct llama_context_params params) { struct llama_model * model = llama_load_model_from_file(path_model, params); @@ -6429,7 +6807,7 @@ struct llama_data_file_context : llama_data_context { * llama_copy_state_data(ctx, &data_ctx); * */ -void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { +static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { // TODO: does not support multi-sequence states { const auto & kv_self = ctx->kv_self; @@ -6835,19 +7213,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) { int llama_tokenize( struct llama_context * ctx, const char * text, + int text_len, llama_token * tokens, int n_max_tokens, bool add_bos) { - return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); + return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos); } int llama_tokenize_with_model( const struct llama_model * model, const char * text, + int text_len, llama_token * tokens, int n_max_tokens, bool add_bos) { - auto res = llama_tokenize_internal(model->vocab, text, add_bos); + auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos); if (n_max_tokens < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); @@ -6989,7 +7369,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { } // For internal test use -const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { +const std::vector> & llama_internal_get_tensor_map( + struct llama_context * ctx +) { return ctx->model.tensors_by_name; } diff --git a/llama.h b/llama.h index 043b62e10..0af9c1089 100644 --- a/llama.h +++ b/llama.h @@ -388,6 +388,7 @@ extern "C" { LLAMA_API int llama_tokenize( struct llama_context * ctx, const char * text, + int text_len, llama_token * tokens, int n_max_tokens, bool add_bos); @@ -395,6 +396,7 @@ extern "C" { LLAMA_API int llama_tokenize_with_model( const struct llama_model * model, const char * text, + int text_len, llama_token * tokens, int n_max_tokens, bool add_bos); @@ -554,7 +556,9 @@ extern "C" { struct ggml_tensor; -const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); +const std::vector> & llama_internal_get_tensor_map( + struct llama_context * ctx +); #endif // LLAMA_API_INTERNAL diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 48758cda8..e96372c4b 100644 
--- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -16,7 +16,7 @@ constexpr int kVecSize = 1 << 18; -float drawFromGaussianPdf(std::mt19937& rndm) { +static float drawFromGaussianPdf(std::mt19937& rndm) { constexpr double kScale = 1./(1. + std::mt19937::max()); constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale; static float lastX; @@ -28,7 +28,8 @@ float drawFromGaussianPdf(std::mt19937& rndm) { haveX = true; return r*cos(phi); } -void fillRandomGaussianFloats(std::vector& values, std::mt19937& rndm, float mean = 0) { + +static void fillRandomGaussianFloats(std::vector& values, std::mt19937& rndm, float mean = 0) { for (auto& v : values) v = mean + drawFromGaussianPdf(rndm); } diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake index 5023b77ab..e33f3349a 100644 --- a/scripts/build-info.cmake +++ b/scripts/build-info.cmake @@ -2,6 +2,8 @@ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in") set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") set(BUILD_NUMBER 0) set(BUILD_COMMIT "unknown") +set(BUILD_COMPILER "unknown") +set(BUILD_TARGET "unknown") # Look for git find_package(Git) @@ -41,11 +43,45 @@ if(Git_FOUND) endif() endif() +if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0) + set(BUILD_COMMIT ${HEAD}) + set(BUILD_NUMBER ${COUNT}) +endif() + +execute_process( + COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER} + OUTPUT_VARIABLE OUT + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RES +) +if (RES EQUAL 0) + set(BUILD_COMPILER ${OUT}) +endif() + +execute_process( + COMMAND ${CMAKE_C_COMPILER} -dumpmachine + OUTPUT_VARIABLE OUT + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RES +) +if (RES EQUAL 0) + set(BUILD_TARGET ${OUT}) +endif() + # Only write the header if it's changed to prevent unnecessary recompilation if(EXISTS ${HEADER_FILE}) - file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"") - list(GET CONTENTS 0 EXISTING) - if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"") + file(READ ${HEADER_FILE} CONTENTS) + string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS}) + set(OLD_COMMIT ${CMAKE_MATCH_1}) + string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS}) + set(OLD_COMPILER ${CMAKE_MATCH_1}) + string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS}) + set(OLD_TARGET ${CMAKE_MATCH_1}) + if ( + NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR + NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR + NOT OLD_TARGET STREQUAL BUILD_TARGET + ) configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) endif() else() diff --git a/scripts/build-info.h.in b/scripts/build-info.h.in index 75d1e16fd..e996faef0 100644 --- a/scripts/build-info.h.in +++ b/scripts/build-info.h.in @@ -3,5 +3,7 @@ #define BUILD_NUMBER @BUILD_NUMBER@ #define BUILD_COMMIT "@BUILD_COMMIT@" +#define BUILD_COMPILER "@BUILD_COMPILER@" +#define BUILD_TARGET "@BUILD_TARGET@" #endif // BUILD_INFO_H diff --git a/scripts/build-info.sh b/scripts/build-info.sh index ed0d6c56a..3c8b1fb85 100755 --- a/scripts/build-info.sh +++ b/scripts/build-info.sh @@ -1,23 +1,35 @@ #!/bin/sh -BUILD_NUMBER="0" -BUILD_COMMIT="unknown" +CC=$1 -REV_LIST=$(git rev-list --count HEAD) -if [ $? -eq 0 ]; then - BUILD_NUMBER=$REV_LIST +build_number="0" +build_commit="unknown" +build_compiler="unknown" +build_target="unknown" + +if out=$(git rev-list --count HEAD); then + # git is broken on WSL so we need to strip extra newlines + build_number=$(printf '%s' "$out" | tr -d '\n') fi -REV_PARSE=$(git rev-parse --short HEAD) -if [ $? 
-eq 0 ]; then - BUILD_COMMIT=$REV_PARSE +if out=$(git rev-parse --short HEAD); then + build_commit=$(printf '%s' "$out" | tr -d '\n') +fi + +if out=$($CC --version | head -1); then + build_compiler=$out +fi + +if out=$($CC -dumpmachine); then + build_target=$out fi echo "#ifndef BUILD_INFO_H" echo "#define BUILD_INFO_H" -echo "" -echo "#define BUILD_NUMBER $BUILD_NUMBER" | tr -d '\n' -echo "" -echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" | tr -d '\n' -echo "" +echo +echo "#define BUILD_NUMBER $build_number" +echo "#define BUILD_COMMIT \"$build_commit\"" +echo "#define BUILD_COMPILER \"$build_compiler\"" +echo "#define BUILD_TARGET \"$build_target\"" +echo echo "#endif // BUILD_INFO_H" diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index 8ab240202..ce4976858 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -36,15 +36,15 @@ #define GGML_PRINT(...) printf(__VA_ARGS__) -float frand(void) { +static float frand(void) { return (float)rand()/(float)RAND_MAX; } -int irand(int n) { +static int irand(int n) { return rand()%n; } -void get_random_dims(int64_t * dims, int ndims) { +static void get_random_dims(int64_t * dims, int ndims) { dims[0] = dims[1] = dims[2] = dims[3] = 1; for (int i = 0; i < ndims; i++) { @@ -52,7 +52,7 @@ void get_random_dims(int64_t * dims, int ndims) { } } -void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { +static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { dims[0] = dims[1] = dims[2] = dims[3] = 1; for (int i = 0; i < ndims; i++) { @@ -61,12 +61,9 @@ void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { } -struct ggml_tensor * get_random_tensor( - struct ggml_context * ctx0, - int ndims, - int64_t ne[], - float fmin, - float fmax) { +static struct ggml_tensor * get_random_tensor( + struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax +) { struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); switch (ndims) { @@ -109,11 +106,11 @@ struct ggml_tensor * get_random_tensor( return result; } -float get_element(const struct ggml_tensor * t, int idx) { +static float get_element(const struct ggml_tensor * t, int idx) { return ((float *)t->data)[idx]; } -void set_element(struct ggml_tensor * t, int idx, float value) { +static void set_element(struct ggml_tensor * t, int idx, float value) { ((float *)t->data)[idx] = value; } diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 8d3c162d2..884af4054 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -13,24 +13,24 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; -const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; -const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; -const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; -const float MAX_DOT_PRODUCT_ERROR = 0.02f; +constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; +constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; -const char* RESULT_STR[] = {"ok", "FAILED"}; +static const char* RESULT_STR[] = {"ok", "FAILED"}; // Generate synthetic data -void generate_data(float offset, size_t n, float * dst) { +static void generate_data(float offset, size_t n, float * dst) { for (size_t i = 0; i < n; i++) { dst[i] = 0.1 + 2*cosf(i + offset); 
} } // Calculate RMSE between two float arrays -float array_rmse(const float * a1, const float * a2, size_t n) { +static float array_rmse(const float * a1, const float * a2, size_t n) { double sum = 0; for (size_t i = 0; i < n; i++) { double diff = a1[i] - a2[i]; @@ -40,7 +40,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) { } // Total quantization error on test data -float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { +static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { std::vector tmp_q(2*test_size); std::vector tmp_out(test_size); @@ -50,7 +50,7 @@ float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, cons } // Total quantization error on test data -float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { +static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { std::vector tmp_q(2*test_size); std::vector tmp_out(test_size); std::vector tmp_out_ref(test_size); @@ -64,7 +64,7 @@ float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); } -float dot_product(const float * a1, const float * a2, size_t test_size) { +static float dot_product(const float * a1, const float * a2, size_t test_size) { double sum = 0; for (size_t i = 0; i < test_size; i++) { sum += a1[i] * a2[i]; @@ -73,7 +73,9 @@ float dot_product(const float * a1, const float * a2, size_t test_size) { } // Total dot product error -float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) { +static float dot_product_error( + ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2 +) { std::vector tmp_q1(2*test_size); std::vector tmp_q2(2*test_size); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index cbea7d452..01aa69877 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -61,22 +61,22 @@ inline int64_t cpu_cycles() { // Generate synthetic data -void generate_data(float offset, size_t n, float * dst) { +static void generate_data(float offset, size_t n, float * dst) { for (size_t i = 0; i < n; i++) { dst[i] = 0.1 + 2*cosf(i + offset); } } -float gigabytes_per_second(size_t bytes, int64_t usecs) { +static float gigabytes_per_second(size_t bytes, int64_t usecs) { return bytes / (float) usecs * 1000000 / (1024*1024*1024); } -void * align_with_offset(void * ptr, int offset) { +static void * align_with_offset(void * ptr, int offset) { size_t dummy_size = MAX_ALIGNMENT * 4; return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; } -void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function & function) { +static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function & function) { int64_t min_time_us = INT64_MAX; int64_t total_time_us = 0; int64_t min_time_cycles = INT64_MAX; @@ -108,7 +108,7 @@ void benchmark_function(size_t size, size_t q_size, int64_t iterations, const st printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us)); } -void usage(char * argv[]) { +static void usage(char * argv[]) { printf("Benchmark quantization specific functions on synthetic data\n"); printf("\n"); printf("usage: %s 
[options]\n", argv[0]); diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 4437c3948..019c0d462 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -12,7 +12,8 @@ #include #include -void dump(const llama_token_data_array * candidates) { + +static void dump(const llama_token_data_array * candidates) { for (size_t i = 0; i < candidates->size; i++) { printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); } @@ -21,9 +22,7 @@ void dump(const llama_token_data_array * candidates) { #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) -void test_top_k(const std::vector & probs, - const std::vector & expected_probs, - int k) { +static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); @@ -45,10 +44,7 @@ void test_top_k(const std::vector & probs, } -void test_top_p(const std::vector & probs, - const std::vector & expected_probs, - float p) { - +static void test_top_p(const std::vector & probs, const std::vector & expected_probs, float p) { size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); @@ -70,9 +66,7 @@ void test_top_p(const std::vector & probs, } -void test_tfs(const std::vector & probs, - const std::vector & expected_probs, - float z) { +static void test_tfs(const std::vector & probs, const std::vector & expected_probs, float z) { size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); @@ -93,9 +87,7 @@ void test_tfs(const std::vector & probs, } -void test_typical(const std::vector & probs, - const std::vector & expected_probs, - float p) { +static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { size_t n_vocab = probs.size(); std::vector candidates; candidates.reserve(n_vocab); @@ -116,11 +108,10 @@ void test_typical(const std::vector & probs, } -void test_repetition_penalty( - const std::vector & probs, - const std::vector & last_tokens, - const std::vector & expected_probs, - float penalty) { +static void test_repetition_penalty( + const std::vector & probs, const std::vector & last_tokens, + const std::vector & expected_probs, float penalty +) { assert(probs.size() == expected_probs.size()); size_t n_vocab = probs.size(); @@ -145,11 +136,10 @@ void test_repetition_penalty( } -void test_frequency_presence_penalty( - const std::vector & probs, - const std::vector & last_tokens, - const std::vector & expected_probs, - float alpha_frequency, float alpha_presence) { +static void test_frequency_presence_penalty( + const std::vector & probs, const std::vector & last_tokens, + const std::vector & expected_probs, float alpha_frequency, float alpha_presence +) { assert(probs.size() == expected_probs.size()); size_t n_vocab = probs.size(); diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp index edbd86f85..dfb2e81a9 100644 --- a/tests/test-tokenizer-0-llama.cpp +++ b/tests/test-tokenizer-0-llama.cpp @@ -36,6 +36,7 @@ static const std::map> & k_tests() { { " Hello" , { 1678, 15043, }, }, { " Hello" , { 268, 15043, }, }, { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, }, + { " (" , { 29871, 313, }, }, }; return _k_tests; diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp index 24c420372..a95d462cf 100644 --- a/tests/test-tokenizer-1-llama.cpp +++ 
b/tests/test-tokenizer-1-llama.cpp @@ -13,7 +13,7 @@ typedef int codepoint; -std::string codepoint_to_utf8(codepoint cp) { +static std::string codepoint_to_utf8(codepoint cp) { std::string result; if (0x00 <= cp && cp <= 0x7f) { result.push_back(cp); @@ -87,11 +87,9 @@ int main(int argc, char **argv) { std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_spm(ctx, tokens); if (check != str) { - fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%d) but tokenization of this detokenizes to >%s<(%d)\n", - __func__, i, str.c_str(), (int) str.length(), check.c_str(), (int) check.length()); - if (i != 3) { - return 2; - } + fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n", + __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); + return 2; } } @@ -100,12 +98,10 @@ int main(int argc, char **argv) { std::string str = codepoint_to_utf8(cp); std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_spm(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%d) instead of >%s<(%d)\n", - __func__, cp, check.c_str(), (int) check.length(), str.c_str(), (int) str.length()); - if (cp != 0 && cp != 9601) { - return 3; - } + if (cp != 9601 && str != check) { + fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); + return 3; } } } @@ -114,8 +110,8 @@ int main(int argc, char **argv) { std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_spm(ctx, tokens); if (str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%d) instead of >%s<(%d)\n", - __func__, cp, check.c_str(), (int) check.length(), str.c_str(), (int) str.length()); + fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); return 4; } }
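
Usage sketch (illustrative, not part of the patch): the diff above changes llama_tokenize() and llama_tokenize_with_model() to take an explicit text_len argument, and callers such as tokenize_file() in train-text-from-scratch.cpp now pass the buffer length and retry when the first call returns a negative token count. A minimal caller-side helper following that two-pass pattern is sketched below; the function name tokenize_text and the variables ctx/text are placeholders introduced here, not identifiers from the patch.

#include <string>
#include <vector>
#include "llama.h"

// Illustrative only: tokenize a std::string using the explicit-length
// llama_tokenize() declared in the llama.h hunk above. As in the
// tokenize_file() change, a negative return value is the negated number
// of tokens required, so the buffer is resized and the call repeated.
static std::vector<llama_token> tokenize_text(llama_context * ctx, const std::string & text, bool add_bos) {
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0)); // rough initial guess
    int n = llama_tokenize(ctx, text.c_str(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) {
        tokens.resize(-n); // grow to the reported required size and retry
        n = llama_tokenize(ctx, text.c_str(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    }
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}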