llama.cpp/Makefile

# Define the default target now so that it is always the first target.
# `default` is only an alias for the listed binaries and never produces a file
# of that name, so it is declared phony: a stray file called `default` in the
# source tree can then no longer make this target look up to date.
# (.PHONY itself starts with a dot and is never chosen as the default goal.)
default: main quantize quantize-stats perplexity embedding vdot
.PHONY: default

# Host identification.
# Each UNAME_* value is queried from `uname` only when it does not already
# have a non-empty value, so any of them can be preset by the caller
# (for example: make UNAME_M=arm64).

# Operating system name (compared against Linux, Darwin, FreeBSD, ... below).
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif

# Processor type (used by the Darwin ARM64 check below).
ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif

# Machine hardware name (selects the architecture-specific flags below).
ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif

# First line of each compiler's --version output, captured once at parse time
# and shown in the build-info banner.
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# macOS on ARM hardware can still announce itself as x86_64.
# When the OS is Darwin and the reported processor is not `arm`, ask sysctl
# whether the machine has ARM64 support and warn the user if it does.
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
  ifneq ($(UNAME_P),arm)
    # Empty when the sysctl key does not exist (stderr is discarded).
    SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
    ifeq ($(SYSCTL_M),1)
      # The overrides below are disabled; only a warning is emitted.
      # UNAME_P := arm
      # UNAME_M := arm64
      warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
    endif
  endif
endif

#
# Compile flags
#

# Base flags. The language standards are pinned to C11 and C++11.
# These are plain `=` assignments, so the later `+=` lines in this file keep
# appending to the same recursively-expanded variables.
CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS  =

# Warnings: the set common to both compilers first, then the
# language-specific additions (C) or suppressions (C++).
CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual
CFLAGS   += -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual
CXXFLAGS += -Wno-unused-function -Wno-multichar

# OS specific
# TODO: support Windows
#
# Every supported OS needs exactly the same flag, so one membership test
# replaces the six copy-pasted conditionals. To support another OS that
# uses -pthread, add its `uname -s` name to the list. Behaviour is unchanged
# for any single-word UNAME_S, which is what `uname -s` produces.
ifneq ($(filter Linux Darwin FreeBSD NetBSD OpenBSD Haiku,$(UNAME_S)),)
  CFLAGS   += -pthread
  CXXFLAGS += -pthread
endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
#
# x86: true when UNAME_M is one of x86_64 / i686.
# NOTE: the comparison also holds when UNAME_M is empty, because both sides
# are then empty.
ifeq ($(filter $(UNAME_M),x86_64 i686),$(UNAME_M))
  # Tune for the build host and enable every CPU extension it offers.
  CFLAGS   += -march=native -mtune=native
  CXXFLAGS += -march=native -mtune=native

  # AVX-only alternative (disabled):
  #CFLAGS   += -mfma -mf16c -mavx
  #CXXFLAGS += -mfma -mf16c -mavx
endif
# POWER (any UNAME_M starting with ppc64).
ifneq ($(filter ppc64%,$(UNAME_M)),)
  # Look for a POWER9 CPU in /proc/cpuinfo. stderr is discarded so hosts
  # without that file do not print a grep error on every make invocation;
  # the probe then simply yields an empty result (same as the sysctl probe
  # in the Darwin section).
  POWER9_M := $(shell grep "POWER9" /proc/cpuinfo 2>/dev/null)
  ifneq (,$(findstring POWER9,$(POWER9_M)))
    CFLAGS   += -mcpu=power9
    CXXFLAGS += -mcpu=power9
  endif
  # Require c++23's std::byteswap for big-endian support.
  # Only the exact machine name `ppc64` is treated as big-endian here; the
  # later -std=c++23 takes effect because it follows -std=c++11 in CXXFLAGS.
  ifeq ($(UNAME_M),ppc64)
    CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
  endif
endif
# Apple Accelerate framework.
# Used on every Darwin host unless the caller defines LLAMA_NO_ACCELERATE.
# Per the original note this targets the Mac M1; `-framework Accelerate`
# works on Intel Macs as well, but with a negligible performance boost.
ifeq ($(UNAME_S),Darwin)
  ifndef LLAMA_NO_ACCELERATE
    CFLAGS  += -DGGML_USE_ACCELERATE
    LDFLAGS += -framework Accelerate
  endif
endif
# OpenBLAS backend (opt-in: make LLAMA_OPENBLAS=1).
# Headers are expected under /usr/local/include/openblas; the library is
# linked as -lopenblas.
ifdef LLAMA_OPENBLAS
  LDFLAGS += -lopenblas
  CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
endif
# NVIDIA cuBLAS backend (opt-in: make LLAMA_CUBLAS=1).
#
# Layout matters here. GNU make keeps a rule's "rule context" open across
# endif/ifdef lines, and inside that context every tab-indented line is taken
# as part of the recipe. With the ggml-cuda.o rule as the last item of this
# block, tab-indented assignments in the conditionals that follow it were
# appended to this recipe and handed to the shell instead of being applied
# as make assignments. The rule is therefore written first, and the
# space-indented assignments after it close the rule context again.
# The recipe is expanded only when it runs, so it still sees the final values
# of NVCC, NVCCFLAGS and CXXFLAGS.
ifdef LLAMA_CUBLAS
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@

  CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
  LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
  # Extra object linked into every binary and into libllama.so via $(OBJS).
  OBJS      += ggml-cuda.o
  NVCC      = nvcc
  NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
endif
# Profiling build (opt-in: make LLAMA_GPROF=1): compile C and C++ with -pg.
# The assignments are indented with spaces, not a tab: a tab-indented line
# that follows a rule (even across endif/ifdef) is parsed by GNU make as a
# recipe line of that rule rather than as a variable assignment.
ifdef LLAMA_GPROF
  CFLAGS   += -pg
  CXXFLAGS += -pg
endif
# ARM targets, selected by the prefix of UNAME_M.
# All bodies are indented with spaces, not tabs: a tab-indented line that
# follows a rule (even across endif/ifdef) is parsed by GNU make as a recipe
# line of that rule, so the flags would not be applied.
ifneq ($(filter aarch64%,$(UNAME_M)),)
  CFLAGS   += -mcpu=native
  CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
  # Raspberry Pi 1, 2, 3
  CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
  # Raspberry Pi 4
  CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
  # Raspberry Pi 4
  CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

# LLAMA_NO_RMSE: pass -DGGML_NO_RMSE to the C compiler.
# Space-indented on purpose: a tab-indented assignment that follows a rule
# (even across endif/ifdef) is parsed by GNU make as a recipe line.
ifdef LLAMA_NO_RMSE
  CFLAGS += -DGGML_NO_RMSE
endif

#
# Print build information
#

# $(info ...) is expanded while the makefile is being read, so this banner is
# printed on every invocation before any recipe runs. The flag variables are
# shown with everything the sections above have appended to them; CCV/CXXV
# hold the first line of each compiler's --version output.
$(info I llama.cpp build info: )
$(info I UNAME_S:  $(UNAME_S))
$(info I UNAME_P:  $(UNAME_P))
$(info I UNAME_M:  $(UNAME_M))
$(info I CFLAGS:   $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS:  $(LDFLAGS))
$(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
# Blank separator line between the banner and the build output.
$(info )

#
# Build library
#

# Core objects. In each rule the first prerequisite ($<) is the translation
# unit being compiled; the remaining prerequisites are headers that trigger a
# rebuild when they change.

# C tensor library, built with the C compiler and CFLAGS.
ggml.o: ggml.c ggml.h
	$(CC) $(CFLAGS) -o $@ -c $<

# C++ objects, built with the C++ compiler and CXXFLAGS.
llama.o: llama.cpp ggml.h llama.h llama_util.h
	$(CXX) $(CXXFLAGS) -o $@ -c $<

common.o: examples/common.cpp examples/common.h
	$(CXX) $(CXXFLAGS) -o $@ -c $<

# Remove every build product this Makefile creates: all objects, the binaries
# of the default target (including vdot), the shared library and the
# benchmark executable. Declared phony so a file named `clean` cannot
# disable the target.
.PHONY: clean
clean:
	rm -vf *.o main quantize quantize-stats perplexity embedding vdot libllama.so benchmark-q4_0-matmult

# Main example binary: compiles examples/main/main.cpp and links it with the
# core objects plus any backend objects in $(OBJS), then prints a usage hint
# framed by one blank line above and below.
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)
	@printf '\n%s\n\n' '====  Run ./main -h for help.  ===='

# Stand-alone tools. Each one is built from a single source file plus the
# objects it needs; the prerequisite lists are given per target and the
# (identical) link recipe is shared by all of them below.
quantize:       examples/quantize/quantize.cpp             ggml.o llama.o          $(OBJS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o          $(OBJS)
perplexity:     examples/perplexity/perplexity.cpp         ggml.o llama.o common.o $(OBJS)
embedding:      examples/embedding/embedding.cpp           ggml.o llama.o common.o $(OBJS)
vdot:           pocs/vdot/vdot.cpp                         ggml.o                  $(OBJS)

# One independent rule per listed target; $^ is that target's own
# prerequisite list from the table above and $@ is the binary being linked.
quantize quantize-stats perplexity embedding vdot:
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

# Shared library made of the llama and ggml objects plus any backend objects
# in $(OBJS). It is not part of the default target; build it with
# `make libllama.so`.
libllama.so: llama.o ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) -shared -fPIC $^ -o $@ $(LDFLAGS)

#
# Tests
#

# q4_0 matrix-multiplication benchmark.
#
# The executable is a real file target, so it is relinked only when its
# source or one of the objects changes, and make knows which file the recipe
# creates.
benchmark-q4_0-matmult: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

# `make benchmark` keeps working as before: build the executable if needed,
# then run it. Phony because no file named `benchmark` is ever produced.
.PHONY: benchmark
benchmark: benchmark-q4_0-matmult
	./benchmark-q4_0-matmult

# Test-suite entry point: hands over to ./tests/run-tests.sh, run with bash.
# Phony, so it runs on every `make tests` regardless of the tests/ directory.
tests:
	bash ./tests/run-tests.sh
.PHONY: tests
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`# Define the default target now so that it is always the first target`
			`default: main quantize quantize-stats perplexity embedding vdot`

Initial release 2023-03-10 18:40:58 +00:00			`ifndef UNAME_S`
			`UNAME_S := $(shell uname -s)`
			`endif`

			`ifndef UNAME_P`
			`UNAME_P := $(shell uname -p)`
			`endif`

			`ifndef UNAME_M`
			`UNAME_M := $(shell uname -m)`
			`endif`

			`CCV := $(shell $(CC) --version \| head -n 1)`
			`CXXV := $(shell $(CXX) --version \| head -n 1)`

			`# Mac OS + Arm can report x86_64`
			`# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789`
			`ifeq ($(UNAME_S),Darwin)`
			`ifneq ($(UNAME_P),arm)`
Makefile: slightly cleanup for Mac Intel; echo instead of run ./main -h (#335) 2023-03-21 15:44:11 +00:00			`SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)`
Initial release 2023-03-10 18:40:58 +00:00			`ifeq ($(SYSCTL_M),1)`
			`# UNAME_P := arm`
			`# UNAME_M := arm64`
			`warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)`
			`endif`
			`endif`
			`endif`

			`#`
			`# Compile flags`
			`#`

Add tokenizer test + revert to C++11 (#355) * Add test-tokenizer-0 to do a few tokenizations - feel free to expand * Added option to convert-pth-to-ggml.py script to dump just the vocabulary * Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests) * Added utility to load vocabulary file from previous point (temporary implementation) * Avoid using std::string_view and drop back to C++11 (hope I didn't break something) * Rename gpt_vocab -> llama_vocab * All CMake binaries go into ./bin/ now 2023-03-21 15:29:41 +00:00			`# keep standard at C11 and C++11`
Initial release 2023-03-10 18:40:58 +00:00			`CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC`
Add tokenizer test + revert to C++11 (#355) * Add test-tokenizer-0 to do a few tokenizations - feel free to expand * Added option to convert-pth-to-ggml.py script to dump just the vocabulary * Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests) * Added utility to load vocabulary file from previous point (temporary implementation) * Avoid using std::string_view and drop back to C++11 (hope I didn't break something) * Rename gpt_vocab -> llama_vocab * All CMake binaries go into ./bin/ now 2023-03-21 15:29:41 +00:00			`CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC`
Initial release 2023-03-10 18:40:58 +00:00			`LDFLAGS =`

all : be more strict about converting float to double (#458) * Be more strict about converting float to double * Test equivalence of round, SILU implementations Test module is commented out in CMakeLists.txt because the tests may take a long time, depending on how much the compiler optimizes. * Fix softmax in perplexity.cpp * all : prefer float over double where appropriate * perplexity : add <cmath> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-28 16:48:20 +00:00			`# warnings`
ggml : Q4 cleanup - remove 4-bit dot product code (#1061) * Q4 cleanup * Remove unused AVX512 Q4_0 code 2023-04-19 16:06:37 +00:00			`CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith`
Rewrite loading code to try to satisfy everyone: - Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.) - Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported). - Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`. - Improve validation and error checking. - Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front). - Support VirtualLock on Windows (using the same `--mlock` option as on Unix). - Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...) - To help implement this, move mlock support from ggml to the loading code. - madvise/PrefetchVirtualMemory support (based on #740) - Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap). - Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way'). Implementation notes: I tried to factor the code into more discrete pieces than before. 
Regarding code style: I tried to follow the code style, but I'm naughty and used a few advanced C++ features repeatedly: - Destructors to make it easier to ensure everything gets cleaned up. - Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure. The exceptions are converted to error codes at the API boundary.) Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740) 2023-04-08 19:24:37 +00:00			`CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar`
all : be more strict about converting float to double (#458) * Be more strict about converting float to double * Test equivalence of round, SILU implementations Test module is commented out in CMakeLists.txt because the tests may take a long time, depending on how much the compiler optimizes. * Fix softmax in perplexity.cpp * all : prefer float over double where appropriate * perplexity : add <cmath> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-03-28 16:48:20 +00:00
Initial release 2023-03-10 18:40:58 +00:00			`# OS specific`
			`# TODO: support Windows`
			`ifeq ($(UNAME_S),Linux)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
			`ifeq ($(UNAME_S),Darwin)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
			`ifeq ($(UNAME_S),FreeBSD)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
Add NetBSD support. (#90) 2023-03-13 16:40:54 +00:00			`ifeq ($(UNAME_S),NetBSD)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
Add OpenBSD support (#314) 2023-03-21 15:50:09 +00:00			`ifeq ($(UNAME_S),OpenBSD)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
Initial release 2023-03-10 18:40:58 +00:00			`ifeq ($(UNAME_S),Haiku)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`

			`# Architecture specific`
			`# TODO: probably these flags need to be tweaked on some architectures`
			`# feel free to update the Makefile for your architecture and send a pull request or issue`
			`ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))`
make : use -march=native -mtune=native on x86 (#609) 2023-04-02 07:17:05 +00:00			`# Use all CPU extensions that are available:`
ggml : fix AVX build + update to new Q8_0 format 2023-04-22 08:08:12 +00:00			`CFLAGS += -march=native -mtune=native`
make : missing host optimizations in CXXFLAGS (#763) 2023-04-05 14:38:37 +00:00			`CXXFLAGS += -march=native -mtune=native`
ggml : fix AVX build + update to new Q8_0 format 2023-04-22 08:08:12 +00:00
			`# Usage AVX-only`
			`#CFLAGS += -mfma -mf16c -mavx`
			`#CXXFLAGS += -mfma -mf16c -mavx`
Initial release 2023-03-10 18:40:58 +00:00			`endif`
			`ifneq ($(filter ppc64%,$(UNAME_M)),)`
			`POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)`
			`ifneq (,$(findstring POWER9,$(POWER9_M)))`
ggml : fix AVX build + update to new Q8_0 format 2023-04-22 08:08:12 +00:00			`CFLAGS += -mcpu=power9`
additional optimizations for POWER9 (#454) 2023-03-24 15:19:26 +00:00			`CXXFLAGS += -mcpu=power9`
Initial release 2023-03-10 18:40:58 +00:00			`endif`
			`# Require c++23's std::byteswap for big-endian support.`
			`ifeq ($(UNAME_M),ppc64)`
			`CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN`
			`endif`
			`endif`
Update Makefile var + add comment 2023-03-11 10:26:16 +00:00			`ifndef LLAMA_NO_ACCELERATE`
Makefile: slightly cleanup for Mac Intel; echo instead of run ./main -h (#335) 2023-03-21 15:44:11 +00:00			`# Mac M1 - include Accelerate framework.`
			# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
Initial release 2023-03-10 18:40:58 +00:00			`ifeq ($(UNAME_S),Darwin)`
			`CFLAGS += -DGGML_USE_ACCELERATE`
			`LDFLAGS += -framework Accelerate`
			`endif`
			`endif`
Update Makefile var + add comment 2023-03-11 10:26:16 +00:00			`ifdef LLAMA_OPENBLAS`
Initial release 2023-03-10 18:40:58 +00:00			`CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas`
			`LDFLAGS += -lopenblas`
			`endif`
Add NVIDIA cuBLAS support (#1044) 2023-04-19 09:22:45 +00:00			`ifdef LLAMA_CUBLAS`
Improve cuBLAS performance by using a memory pool (#1094) * Improve cuBLAS performance by using a memory pool * Move cuda specific definitions to ggml-cuda.h/cu * Add CXX flags to nvcc * Change memory pool synchronization mechanism to a spin lock General code cleanup 2023-04-21 19:59:17 +00:00			`CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include`
			`LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64`
			`OBJS += ggml-cuda.o`
			`NVCC = nvcc`
			`NVCCFLAGS = --forward-unknown-to-host-linker -arch=native`
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`ggml-cuda.o: ggml-cuda.cu ggml-cuda.h`
Improve cuBLAS performance by using a memory pool (#1094) * Improve cuBLAS performance by using a memory pool * Move cuda specific definitions to ggml-cuda.h/cu * Add CXX flags to nvcc * Change memory pool synchronization mechanism to a spin lock General code cleanup 2023-04-21 19:59:17 +00:00			`$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@`
Add NVIDIA cuBLAS support (#1044) 2023-04-19 09:22:45 +00:00			`endif`
Update Makefile var + add comment 2023-03-11 10:26:16 +00:00			`ifdef LLAMA_GPROF`
Initial release 2023-03-10 18:40:58 +00:00			`CFLAGS += -pg`
			`CXXFLAGS += -pg`
			`endif`
			`ifneq ($(filter aarch64%,$(UNAME_M)),)`
ggml : fix AVX build + update to new Q8_0 format 2023-04-22 08:08:12 +00:00			`CFLAGS += -mcpu=native`
Initial release 2023-03-10 18:40:58 +00:00			`CXXFLAGS += -mcpu=native`
			`endif`
			`ifneq ($(filter armv6%,$(UNAME_M)),)`
			`# Raspberry Pi 1, 2, 3`
			`CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access`
			`endif`
			`ifneq ($(filter armv7%,$(UNAME_M)),)`
			`# Raspberry Pi 4`
			`CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations`
			`endif`
			`ifneq ($(filter armv8%,$(UNAME_M)),)`
			`# Raspberry Pi 4`
			`CFLAGS += -mfp16-format=ieee -mno-unaligned-access`
			`endif`

RMSE-optimized quants for all quantization types By default this new option is ON. One can turn it off by setting LLAMA_NO_RMSE. With this option enabled, the Q4_3 quantization results in a perplexity of 6.0344, so 0.0273 lower than simple Q4_3 quantization. 2023-04-21 08:26:49 +00:00			`ifdef LLAMA_NO_RMSE`
			`CFLAGS += -DGGML_NO_RMSE`
			`endif`

Initial release 2023-03-10 18:40:58 +00:00			`#`
			`# Print build information`
			`#`

			`$(info I llama.cpp build info: )`
			`$(info I UNAME_S: $(UNAME_S))`
			`$(info I UNAME_P: $(UNAME_P))`
			`$(info I UNAME_M: $(UNAME_M))`
			`$(info I CFLAGS: $(CFLAGS))`
			`$(info I CXXFLAGS: $(CXXFLAGS))`
			`$(info I LDFLAGS: $(LDFLAGS))`
			`$(info I CC: $(CCV))`
			`$(info I CXX: $(CXXV))`
			`$(info )`

			`#`
			`# Build library`
			`#`

			`ggml.o: ggml.c ggml.h`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CC) $(CFLAGS) -c $< -o $@`
Initial release 2023-03-10 18:40:58 +00:00
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`llama.o: llama.cpp ggml.h llama.h llama_util.h`
			`$(CXX) $(CXXFLAGS) -c $< -o $@`
Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 05:32:36 +00:00
Overhaul the examples structure - main -> examples - utils -> examples (renamed to "common") - quantize -> examples - separate tools for "perplexity" and "embedding" Hope I didn't break something ! 2023-03-25 18:26:40 +00:00			`common.o: examples/common.cpp examples/common.h`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CXX) $(CXXFLAGS) -c $< -o $@`
Initial release 2023-03-10 18:40:58 +00:00
			`clean:`
benchmark : add tool for timing q4_0 matrix multiplication (#653) * Initial version of q4_0 matrix multiplication benchmark * Bugfix: Added dependency to ggml.o to benchmark * Reviewer requests: added parameter for threads, switched to ggml_time_us() * Reviewer input: removed rtsc, use epsilon for check * Review comment: Removed set_locale * Feature: Param for numer of iterations, Bugfix for use of parameter threads * Reviewer suggestion: Moved to examples * Reviewer feedback: Updated clean: and benchmark: sections --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-04-13 12:46:23 +00:00			`rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult`
Initial release 2023-03-10 18:40:58 +00:00
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Fix Makefile echo escape codes (by removing them). (#418) 2023-03-23 11:41:32 +00:00			`@echo`
			`@echo '==== Run ./main -h for help. ===='`
			`@echo`
Initial release 2023-03-10 18:40:58 +00:00
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Overhaul the examples structure - main -> examples - utils -> examples (renamed to "common") - quantize -> examples - separate tools for "perplexity" and "embedding" Hope I didn't break something ! 2023-03-25 18:26:40 +00:00
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Add quantize-stats command for testing quantization (#728) Command that calculates some statistics over the errors introduced by quantization, like mean square error, max error and some percentile errors for layer weights. Should be useful for testing quantization improvements. Exposes some internal state from ggml and llama for testing 2023-04-07 22:09:18 +00:00
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Initial release 2023-03-10 18:40:58 +00:00
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Add embedding example to Makefile (#540) 2023-03-28 06:11:09 +00:00
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)`
Adding a simple program to measure speed of dot products (#1041) On my Mac, the direct Q4_1 product is marginally slower (~69 vs ~55 us for Q4_0). The SIMD-ified ggml version is now almost 2X slower (~121 us). On a Ryzen 7950X CPU, the direct product for Q4_1 quantization is faster than the AVX2 implementation (~60 vs ~62 us). --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com> 2023-04-18 19:00:14 +00:00			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`

Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`libllama.so: llama.o ggml.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)`
fix whitespace (#944) 2023-04-13 14:03:57 +00:00
Initial release 2023-03-10 18:40:58 +00:00			`#`
			`# Tests`
			`#`

Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 01:14:14 +00:00			`benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 2023-04-14 19:39:48 +00:00			`$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)`
benchmark : add tool for timing q4_0 matrix multiplication (#653) * Initial version of q4_0 matrix multiplication benchmark * Bugfix: Added dependency to ggml.o to benchmark * Reviewer requests: added parameter for threads, switched to ggml_time_us() * Reviewer input: removed rtsc, use epsilon for check * Review comment: Removed set_locale * Feature: Param for numer of iterations, Bugfix for use of parameter threads * Reviewer suggestion: Moved to examples * Reviewer feedback: Updated clean: and benchmark: sections --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-04-13 12:46:23 +00:00			`./benchmark-q4_0-matmult`
fix whitespace (#944) 2023-04-13 14:03:57 +00:00
Initial release 2023-03-10 18:40:58 +00:00			`.PHONY: tests`
			`tests:`
			`bash ./tests/run-tests.sh`