diff --git a/.gitignore b/.gitignore
index b862a0415..1f841c830 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,6 +52,7 @@ models-mnt
 /server
 /simple
 /speculative
+/parallel
 /train-text-from-scratch
 /vdot
 build-info.h
diff --git a/Makefile b/Makefile
index dc8ae3807..9b631c2a5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
@@ -563,6 +563,9 @@ beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o co
 speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
diff --git a/llama.cpp b/llama.cpp
index df0b39bfb..abfc16c1a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1318,7 +1318,7 @@ static bool llama_kv_cache_find_slot(
 }
 
 // find how many cells are currently in use
-int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
     for (uint32_t i = cache.size - 2; i > 0; --i) {
         if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
             return i + 1;
@@ -1328,7 +1328,7 @@ int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
     return 0;
 }
 
-void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+static void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
     if (c0 < 0) c0 = 0;
     if (c1 < 0) c1 = cache.size;
 
@@ -1338,7 +1338,7 @@ void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t
     }
 }
 
-void llama_kv_cache_rm_seq(
+static void llama_kv_cache_rm_seq(
         struct llama_kv_cache & cache,
                  llama_seq_id   seq_id,
                     llama_pos   p0,
@@ -1353,7 +1353,7 @@ void llama_kv_cache_rm_seq(
     }
 }
 
-void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+static void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) {
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
             cache.cells[i].pos = -1;
@@ -1362,7 +1362,7 @@ void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id)
     }
 }
 
-void llama_kv_cache_shift_seq(
+static void llama_kv_cache_shift_seq(
         struct llama_kv_cache & cache,
                  llama_seq_id   seq_id,
                     llama_pos   p0,