Added falcon main and library based on llama.cpp. CPU inference works (getting ~260 ms/token on 7B 16-bit Falcon). Tested with the 7B 16-bit model and the two Shakespeare models (both in 16-bit precision only).

TODO/WIP:
1) Quantization runs and creates a ggjt 3 file, but something is wrong with the quantized model binary - even quantization from 16 -> 16 bit fails, so something is wrong in the tensors produced.
2) mmap should work with quantized binaries once 1) is solved.
3) CUDA support is mostly there; it is currently disabled (all CPU backend).
4) Memory/context calculations are off, and the GPU memory calculations are wrong as well.
5) The Python conversion script is of the pre-GGML-1 version (tokens without scores).
6) Some things are still called "llama"; they should be renamed to a generic name, as the code works for both models.
7) The GGML files produced by the current Python script use an old ftype method.

Makefiles: cmake with build tools works on Windows; the Makefile for Linux/MSYS was adjusted blind and is not tested yet - something was possibly missed.

Changes to the codebase:
* repeat2 has been added to ggml (jploski - https://github.com/ggerganov/ggml/pull/231), including the backward variant (untested, probably fails)
* minor changes to work with Falcon (name length)
* libfalcon is the previous "llama.cpp" and falcon_main is the previous main.cpp
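As a rough usage sketch (the model path and prompt below are placeholders, and falcon_main is assumed to accept main.cpp's usual -m/-p flags, since it is derived from it):

    make falcon_main
    ./falcon_main -m models/falcon-7b/ggml-model-f16.bin -p "Once upon a time"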
Makefile
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot

ifdef LLAMA_BUILD_SERVER
    BUILD_TARGETS += server
endif

default: $(BUILD_TARGETS)

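# Optional features below are toggled by defining plain make variables on
# the command line; for example, `make LLAMA_BUILD_SERVER=1` adds `server`
# to the default build targets.
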
ifndef UNAME_S
    UNAME_S := $(shell uname -s)
endif

ifndef UNAME_P
    UNAME_P := $(shell uname -p)
endif

ifndef UNAME_M
    UNAME_M := $(shell uname -m)
endif

CCV  := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

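# CC and CXX come from make's built-in defaults (cc / g++) and can be
# overridden on the command line, e.g. `make CC=clang CXX=clang++`.
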
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
    ifneq ($(UNAME_P),arm)
        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
        ifeq ($(SYSCTL_M),1)
            # UNAME_P := arm
            # UNAME_M := arm64
            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
        endif
    endif
endif

#
# Compile flags
#

# keep standard at C11 and C++11
# -Ofast tends to produce faster code, but may not be available for some compilers.
#OPT = -Ofast
OPT = -O3
CFLAGS   = -I.              $(OPT) -std=c11   -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
LDFLAGS  =

ifdef LLAMA_DEBUG
    CFLAGS   += -O0 -g
    CXXFLAGS += -O0 -g
    LDFLAGS  += -g
else
    CFLAGS   += -DNDEBUG
    CXXFLAGS += -DNDEBUG
endif

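# For example, `make LLAMA_DEBUG=1` produces unoptimized binaries with
# debug symbols (the -O0 appended here overrides the earlier $(OPT) flag).
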
# warnings
CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
    CFLAGS   += -pthread
    CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Darwin)
    CFLAGS   += -pthread
    CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),FreeBSD)
    CFLAGS   += -pthread
    CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),NetBSD)
    CFLAGS   += -pthread
    CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),OpenBSD)
    CFLAGS   += -pthread
    CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Haiku)
    CFLAGS   += -pthread
    CXXFLAGS += -pthread
endif

ifdef LLAMA_GPROF
    CFLAGS   += -pg
    CXXFLAGS += -pg
endif
ifdef LLAMA_PERF
    CFLAGS   += -DGGML_PERF
    CXXFLAGS += -DGGML_PERF
endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
    # Use all CPU extensions that are available:
    CFLAGS   += -march=native -mtune=native
    CXXFLAGS += -march=native -mtune=native

    # For an AVX-only build:
    #CFLAGS   += -mfma -mf16c -mavx
    #CXXFLAGS += -mfma -mf16c -mavx

    # For an SSSE3-only build (SSSE3 is not the same as SSE3!):
    #CFLAGS   += -mssse3
    #CXXFLAGS += -mssse3
endif

ifneq ($(filter ppc64%,$(UNAME_M)),)
    POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
    ifneq (,$(findstring POWER9,$(POWER9_M)))
        CFLAGS   += -mcpu=power9
        CXXFLAGS += -mcpu=power9
    endif
    # Require c++23's std::byteswap for big-endian support.
    ifeq ($(UNAME_M),ppc64)
        CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
    endif
endif

ifndef LLAMA_NO_K_QUANTS
    CFLAGS   += -DGGML_USE_K_QUANTS
    CXXFLAGS += -DGGML_USE_K_QUANTS
    OBJS     += k_quants.o
endif

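# k-quants are compiled in by default; `make LLAMA_NO_K_QUANTS=1` builds
# without them.
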
ifndef LLAMA_NO_ACCELERATE
    # Mac M1 - include Accelerate framework.
    # `-framework Accelerate` works on Intel Macs as well, with a negligible boost to prediction time.
    ifeq ($(UNAME_S),Darwin)
        CFLAGS  += -DGGML_USE_ACCELERATE
        LDFLAGS += -framework Accelerate
    endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_OPENBLAS
    CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
    ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
        LDFLAGS += -lopenblas -lcblas
    else
        LDFLAGS += -lopenblas
    endif
endif # LLAMA_OPENBLAS

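# Example (a sketch; assumes OpenBLAS headers and libraries are installed
# in the include paths above): make LLAMA_OPENBLAS=1
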
ifdef LLAMA_BLIS
    CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
    LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
    CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
    CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
    LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
    OBJS      += ggml-cuda.o
    NVCC      = nvcc
    NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
    ifdef LLAMA_CUDA_DMMV_X
        NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
    else
        NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
    endif # LLAMA_CUDA_DMMV_X
    ifdef LLAMA_CUDA_DMMV_Y
        NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
    else
        NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
    endif # LLAMA_CUDA_DMMV_Y
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS

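# Example invocation (a sketch only; per the commit notes above, CUDA
# support is present but currently disabled in the code):
#   make LLAMA_CUBLAS=1 CUDA_PATH=/usr/local/cuda LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_DMMV_Y=2
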
ifdef LLAMA_CLBLAST
    CFLAGS   += -DGGML_USE_CLBLAST
    CXXFLAGS += -DGGML_USE_CLBLAST
    # Mac provides OpenCL as a framework
    ifeq ($(UNAME_S),Darwin)
        LDFLAGS += -lclblast -framework OpenCL
    else
        LDFLAGS += -lclblast -lOpenCL
    endif
    OBJS += ggml-opencl.o

ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
	$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST

ifdef LLAMA_METAL
    CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
    CXXFLAGS += -DGGML_USE_METAL
    LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
    OBJS     += ggml-metal.o

ggml-metal.o: ggml-metal.m ggml-metal.h
	$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL

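# Example (macOS only, a sketch): `make LLAMA_METAL=1` compiles
# ggml-metal.m and links the Metal frameworks listed above.
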
ifneq ($(filter aarch64%,$(UNAME_M)),)
    # Apple M1, M2, etc.
    # Raspberry Pi 3, 4, Zero 2 (64-bit)
    CFLAGS   += -mcpu=native
    CXXFLAGS += -mcpu=native
endif

ifneq ($(filter armv6%,$(UNAME_M)),)
    # Raspberry Pi 1, Zero
    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif

ifneq ($(filter armv7%,$(UNAME_M)),)
    # Raspberry Pi 2
    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif

ifneq ($(filter armv8%,$(UNAME_M)),)
    # Raspberry Pi 3, 4, Zero 2 (32-bit)
    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
	$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS

#
# Print build information
#

$(info I llama.cpp build info: )
$(info I UNAME_S:  $(UNAME_S))
$(info I UNAME_P:  $(UNAME_P))
$(info I UNAME_M:  $(UNAME_M))
$(info I CFLAGS:   $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS:  $(LDFLAGS))
$(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
$(info )

#
# Build library
#

ggml.o: ggml.c ggml.h ggml-cuda.h
	$(CC) $(CFLAGS) -c $< -o $@

llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

libfalcon.o: libfalcon.cpp ggml.h ggml-cuda.h libfalcon.h llama-util.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: examples/common.cpp examples/common.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

falcon_common.o: examples/falcon_common.cpp examples/falcon_common.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

libllama.so: llama.o ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

clean:
	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot falcon_main falcon_quantize build-info.h

#
# Examples
#

main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
	@echo
	@echo '==== Run ./main -h for help. ===='
	@echo

quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

falcon_quantize: examples/falcon_quantize/quantize.cpp build-info.h ggml.o libfalcon.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

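# build-info.h is regenerated from git metadata on every run, but only
# replaced when its contents actually change, so dependents are not
# rebuilt needlessly.
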
build-info.h: $(wildcard .git/index) scripts/build-info.sh
	@sh scripts/build-info.sh > $@.tmp
	@if ! cmp -s $@.tmp $@; then \
		mv $@.tmp $@; \
	else \
		rm $@.tmp; \
	fi

falcon_main: examples/falcon/falcon_main.cpp build-info.h ggml.o libfalcon.o falcon_common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

#
# Tests
#

benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
	./$@

vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

.PHONY: tests clean
tests:
	bash ./tests/run-tests.sh