Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-11-14 23:09:53 +00:00)

Merge branch 'ggerganov:master' into master

This commit is contained in: a4b7b4c398
.github/workflows/build.yml (vendored, 15 changes)

@@ -288,6 +288,7 @@ jobs:
       OPENBLAS_VERSION: 0.3.23
       OPENCL_VERSION: 2023.04.17
       CLBLAST_VERSION: 1.6.0
+      SDE_VERSION: 9.21.1-2023-04-24

     strategy:
       matrix:
@@ -383,11 +384,23 @@ jobs:

       - name: Test
        id: cmake_test
-       if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
+       if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
        run: |
          cd build
          ctest -C Release --verbose --timeout 900

+      - name: Test (Intel SDE)
+       id: cmake_test_sde
+       if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+       run: |
+         curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
+         # for some weird reason windows tar doesn't like sde tar.xz
+         7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
+         7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
+         $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
+         cd build
+         & $sde -future -- ctest -C Release --verbose --timeout 900
+
       - name: Determine tag name
        id: tag
        shell: bash
.github/workflows/python-lint.yml (vendored, new file, 20 lines)

@@ -0,0 +1,20 @@
name: flake8 Lint

on: [push, pull_request]

jobs:
  flake8-lint:
    runs-on: ubuntu-latest
    name: Lint
    steps:
    - name: Check out source repository
      uses: actions/checkout@v3
    - name: Set up Python environment
      uses: actions/setup-python@v4
      with:
        python-version: "3.11"
    - name: flake8 Lint
      uses: py-actions/flake8@v2
      with:
        ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
        exclude: "examples/*,examples/*/**,*/**/__init__.py"
.gitignore (vendored, 6 changes)

@@ -15,6 +15,7 @@
 .DS_Store
 .build/
 .cache/
+.ccls-cache/
 .direnv/
 .envrc
 .swiftpm
@@ -45,7 +46,7 @@ models-mnt
 /infill
 /libllama.so
 /llama-bench
-/llava
+/llava-cli
 /main
 /metal
 /perplexity
@@ -63,8 +64,9 @@ models-mnt
 /speculative
 /parallel
 /train-text-from-scratch
+/tokenize
 /vdot
-build-info.h
+/common/build-info.cpp
 arm_neon.h
 compile_commands.json
 CMakeSettings.json
CMakeLists.txt

@@ -100,39 +100,6 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALO
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ON)

-#
-# Build info header
-#
-
-# Generate initial build-info.h
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
-
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-
-    # Is git submodule
-    if(NOT IS_DIRECTORY "${GIT_DIR}")
-        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
-        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
-    endif()
-
-    # Add a custom target for build-info.h
-    add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
-
-    # Add a custom command to rebuild build-info.h when .git/index changes
-    add_custom_command(
-        OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
-        COMMENT "Generating build details from Git"
-        COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        DEPENDS "${GIT_DIR}/index"
-        VERBATIM
-    )
-else()
-    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-endif()
-
 #
 # Compile flags
 #
@@ -491,6 +458,15 @@ if (LLAMA_LTO)
     endif()
 endif()

+# this version of Apple ld64 is buggy
+execute_process(
+    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
+    ERROR_VARIABLE output
+)
+if (output MATCHES "dyld-1015\.7")
+    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
+endif()
+
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
@@ -543,6 +519,10 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
     message(STATUS "x86 detected")
     if (MSVC)
+        # instruction set detection for MSVC only
+        if (LLAMA_NATIVE)
+            include(cmake/FindSIMD.cmake)
+        endif ()
         if (LLAMA_AVX512)
             add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
             add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
@@ -594,8 +574,12 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
+    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        add_compile_options(-mcpu=powerpc64le)
+    else()
     add_compile_options(-mcpu=native -mtune=native)
     #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
 else()
     message(STATUS "Unknown architecture")
 endif()
Makefile (96 changes)

@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search \
-	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
+	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -239,6 +239,11 @@ else
 endif
 endif

+# this version of Apple ld64 is buggy
+ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
+	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
+endif
+
 # OS specific
 # TODO: support Windows
 ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
@@ -337,6 +342,12 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 endif
 endif

+ifneq ($(filter ppc64le%,$(UNAME_M)),)
+	MK_CFLAGS += -mcpu=powerpc64le
+	MK_CXXFLAGS += -mcpu=powerpc64le
+	CUDA_POWER_ARCH = 1
+endif
+
 else
	MK_CFLAGS += -march=rv64gcv -mabi=lp64d
	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
@@ -387,6 +398,8 @@ else
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else ifdef CUDA_POWER_ARCH
+	NVCCFLAGS +=
 else
	NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
@@ -542,9 +555,9 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h l
	$(CXX) $(CXXFLAGS) -c $< -o $@

 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o

-common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
+common.o: common/common.cpp $(COMMON_H_DEPS)
	$(CXX) $(CXXFLAGS) -c $< -o $@

 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
@@ -563,46 +576,49 @@ libllama.so: llama.o ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)

 #
 # Examples
 #

-main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
	@echo
	@echo '==== Run ./main -h for help. ===='
	@echo

-infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
@@ -614,28 +630,31 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
+
+llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ifdef LLAMA_METAL
@@ -648,7 +667,7 @@ swift: examples/batched.swift
	(cd examples/batched.swift; make build)
 endif

-build-info.h: $(wildcard .git/index) scripts/build-info.sh
+common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
	@sh scripts/build-info.sh $(CC) > $@.tmp
	@if ! cmp -s $@.tmp $@; then \
		mv $@.tmp $@; \
@@ -656,13 +675,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
		rm $@.tmp; \
	fi

+build-info.o: common/build-info.cpp
+	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
+
 #
 # Tests
 #

 tests: $(TEST_TARGETS)

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 run-benchmark-matmult: benchmark-matmult
@@ -676,40 +698,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-c.o: tests/test-c.c llama.h
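
Note on the build-info change above: targets now link a compiled `build-info.o` object instead of depending on a generated `build-info.h` header. As a rough, hypothetical sketch of what that object provides to the tools that link it (the symbol names come from the build.zig change later in this commit; the values below are placeholders, not what `scripts/build-info.sh` actually emits):

```cpp
// Sketch only: stand-ins for the globals defined by the generated common/build-info.cpp.
// In the real build, scripts/build-info.sh fills these in from git and the compiler.
#include <cstdio>

int          LLAMA_BUILD_NUMBER = 0;         // placeholder value
char const * LLAMA_COMMIT       = "unknown"; // placeholder value
char const * LLAMA_COMPILER     = "unknown"; // placeholder value
char const * LLAMA_BUILD_TARGET = "unknown"; // placeholder value

int main() {
    // Tools that link build-info.o can print build identification like this.
    std::printf("build %d (%s) with %s for %s\n",
                LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
    return 0;
}
```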
README.md (20 changes)

@@ -2,7 +2,6 @@

 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

-- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
-- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
+- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167

 ----

@@ -95,6 +93,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
+- [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)

 **Bindings:**
@@ -411,22 +410,31 @@ Building the program with BLAS support may lead to some performance improvements
   This provides BLAS acceleration on HIP-supported AMD GPUs.
   Make sure to have ROCm installed.
   You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
-  Windows support is coming soon...

   - Using `make`:
     ```bash
     make LLAMA_HIPBLAS=1
     ```
-  - Using `CMake`:
+  - Using `CMake` for Linux:
     ```bash
     mkdir build
     cd build
     CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
     cmake --build .
     ```
+  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
+    ```bash
+    set PATH=%HIP_PATH%\bin;%PATH%
+    mkdir build
+    cd build
+    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+    cmake --build .
+    ```
+    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
+
   The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-  If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
+  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
   The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

   | Option | Legal values | Default | Description |
build.zig (38 changes)

@@ -10,7 +10,6 @@ const Maker = struct {
    builder: *std.build.Builder,
    target: CrossTarget,
    optimize: Mode,
-   config_header: *ConfigHeader,
    enable_lto: bool,

    include_dirs: ArrayList([]const u8),
@@ -41,26 +40,24 @@ const Maker = struct {
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
-       const config_header = builder.addConfigHeader(
-           .{ .style = .blank, .include_path = "build-info.h" },
-           .{
-               .BUILD_NUMBER = 0,
-               .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
-               .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
-               .BUILD_TARGET = try target.allocDescription(builder.allocator),
-           },
-       );
+       try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
+           \\int LLAMA_BUILD_NUMBER = {};
+           \\char const *LLAMA_COMMIT = "{s}";
+           \\char const *LLAMA_COMPILER = "Zig {s}";
+           \\char const *LLAMA_BUILD_TARGET = "{s}";
+           \\
+       , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
        var m = Maker{
            .builder = builder,
            .target = target,
            .optimize = builder.standardOptimizeOption(.{}),
-           .config_header = config_header,
            .enable_lto = false,
            .include_dirs = ArrayList([]const u8).init(builder.allocator),
            .cflags = ArrayList([]const u8).init(builder.allocator),
            .cxxflags = ArrayList([]const u8).init(builder.allocator),
            .objs = ArrayList(*Compile).init(builder.allocator),
        };

        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
@@ -72,7 +69,7 @@ const Maker = struct {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);
-       o.addConfigHeader(m.config_header);
+
        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
@@ -85,7 +82,6 @@ const Maker = struct {
                o.linkLibCpp();
            }
        }
-       o.addConfigHeader(m.config_header);
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
        return o;
@@ -105,7 +101,6 @@ const Maker = struct {
            // linkLibCpp already add (libc++ + libunwind + libc)
            e.linkLibCpp();
        }
-       e.addConfigHeader(m.config_header);
        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
        return e;
@@ -121,6 +116,7 @@ pub fn build(b: *std.build.Builder) !void {
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
    const llama = make.obj("llama", "llama.cpp");
+   const buildinfo = make.obj("common", "common/build-info.cpp");
    const common = make.obj("common", "common/common.cpp");
    const console = make.obj("console", "common/console.cpp");
    const sampling = make.obj("sampling", "common/sampling.cpp");
@@ -128,14 +124,14 @@ pub fn build(b: *std.build.Builder) !void {
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");

-   _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
+   _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
-   _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+   _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-   _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+   _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-   _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+   _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-   _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
+   _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-   _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
+   _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });

-   const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
+   const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
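
For reference, the build.zig hunk above now writes `common/build-info.cpp` directly instead of going through a generated config header. A concrete instance of the file that `writeFile()` call produces would look roughly like the sketch below; all values shown are illustrative, not taken from this commit's actual build output:

```cpp
// Illustrative only: example of the build-info translation unit generated at build time.
int LLAMA_BUILD_NUMBER = 0;                            // the Zig build always writes 0 here
char const *LLAMA_COMMIT = "a4b7b4c398";               // example: output of `git rev-parse HEAD`
char const *LLAMA_COMPILER = "Zig 0.11.0";             // example compiler string
char const *LLAMA_BUILD_TARGET = "x86_64-linux-gnu";   // example target description
```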
cmake/FindSIMD.cmake (new file, 100 lines)

@@ -0,0 +1,100 @@
include(CheckCSourceRuns)

set(AVX_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 a;
        a = _mm256_set1_ps(0);
        return 0;
    }
")

set(AVX512_CODE "
    #include <immintrin.h>
    int main()
    {
        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0);
        __m512i b = a;
        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
        return 0;
    }
")

set(AVX2_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256i a = {0};
        a = _mm256_abs_epi16(a);
        __m256i x;
        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
        return 0;
    }
")

set(FMA_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 acc = _mm256_setzero_ps();
        const __m256 d = _mm256_setzero_ps();
        const __m256 p = _mm256_setzero_ps();
        acc = _mm256_fmadd_ps( d, p, acc );
        return 0;
    }
")

macro(check_sse type flags)
    set(__FLAG_I 1)
    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
    foreach (__FLAG ${flags})
        if (NOT ${type}_FOUND)
            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
            if (HAS_${type}_${__FLAG_I})
                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
            endif()
            math(EXPR __FLAG_I "${__FLAG_I}+1")
        endif()
    endforeach()
    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})

    if (NOT ${type}_FOUND)
        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
        set(${type}_FLAGS "" CACHE STRING "${type} flags")
    endif()

    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()

# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
    set(LLAMA_AVX OFF)
else()
    set(LLAMA_AVX ON)
endif()

check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
    set(LLAMA_AVX2 OFF)
else()
    set(LLAMA_AVX2 ON)
endif()

check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
    set(LLAMA_AVX512 OFF)
else()
    set(LLAMA_AVX512 ON)
endif()
common/CMakeLists.txt

@@ -1,8 +1,47 @@
 # common

+# Build info header
+#
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+
+    # Is git submodule
+    if(NOT IS_DIRECTORY "${GIT_DIR}")
+        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+    endif()
+
+    set(GIT_INDEX "${GIT_DIR}/index")
+else()
+    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+    set(GIT_INDEX "")
+endif()
+
+# Add a custom command to rebuild build-info.cpp when .git/index changes
+add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
+    COMMENT "Generating build details from Git"
+    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
+            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
+    VERBATIM
+)
+set(TARGET build_info)
+add_library(${TARGET} OBJECT build-info.cpp)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+
 set(TARGET common)

-add_library(${TARGET} OBJECT
+add_library(${TARGET} STATIC
+    base64.hpp
     common.h
     common.cpp
     sampling.h
@@ -21,4 +60,4 @@ endif()

 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
+target_link_libraries(${TARGET} PRIVATE llama build_info)
392
common/base64.hpp
Normal file
392
common/base64.hpp
Normal file
@ -0,0 +1,392 @@
|
|||||||
|
/*
|
||||||
|
This is free and unencumbered software released into the public domain.
|
||||||
|
|
||||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||||
|
distribute this software, either in source code form or as a compiled
|
||||||
|
binary, for any purpose, commercial or non-commercial, and by any
|
||||||
|
means.
|
||||||
|
|
||||||
|
In jurisdictions that recognize copyright laws, the author or authors
|
||||||
|
of this software dedicate any and all copyright interest in the
|
||||||
|
software to the public domain. We make this dedication for the benefit
|
||||||
|
of the public at large and to the detriment of our heirs and
|
||||||
|
successors. We intend this dedication to be an overt act of
|
||||||
|
relinquishment in perpetuity of all present and future rights to this
|
||||||
|
software under copyright law.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
For more information, please refer to <http://unlicense.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PUBLIC_DOMAIN_BASE64_HPP_
|
||||||
|
#define PUBLIC_DOMAIN_BASE64_HPP_
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <iterator>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
class base64_error : public std::runtime_error
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using std::runtime_error::runtime_error;
|
||||||
|
};
|
||||||
|
|
||||||
|
class base64
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
enum class alphabet
|
||||||
|
{
|
||||||
|
/** the alphabet is detected automatically */
|
||||||
|
auto_,
|
||||||
|
/** the standard base64 alphabet is used */
|
||||||
|
standard,
|
||||||
|
/** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
|
||||||
|
url_filename_safe
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class decoding_behavior
|
||||||
|
{
|
||||||
|
/** if the input is not padded, the remaining bits are ignored */
|
||||||
|
moderate,
|
||||||
|
/** if a padding character is encounter decoding is finished */
|
||||||
|
loose
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
Encodes all the elements from `in_begin` to `in_end` to `out`.
|
||||||
|
|
||||||
|
@warning The source and destination cannot overlap. The destination must be able to hold at least
|
||||||
|
`required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
|
||||||
|
|
||||||
|
@tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
|
||||||
|
8 bits
|
||||||
|
@tparam Output_iterator the destination; the elements written to it are from the type `char`
|
||||||
|
@param in_begin the beginning of the source
|
||||||
|
@param in_end the ending of the source
|
||||||
|
@param out the destination iterator
|
||||||
|
@param alphabet which alphabet should be used
|
||||||
|
@returns the iterator to the next element past the last element copied
|
||||||
|
@throws see `Input_iterator` and `Output_iterator`
|
||||||
|
*/
|
||||||
|
template<typename Input_iterator, typename Output_iterator>
|
||||||
|
static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
|
||||||
|
alphabet alphabet = alphabet::standard)
|
||||||
|
{
|
||||||
|
constexpr auto pad = '=';
|
||||||
|
const char* alpha = alphabet == alphabet::url_filename_safe
|
||||||
|
? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
|
||||||
|
: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
||||||
|
|
||||||
|
while (in_begin != in_end) {
|
||||||
|
std::uint8_t i0 = 0, i1 = 0, i2 = 0;
|
||||||
|
|
||||||
|
// first character
|
||||||
|
i0 = static_cast<std::uint8_t>(*in_begin);
|
||||||
|
++in_begin;
|
||||||
|
|
||||||
|
*out = alpha[i0 >> 2 & 0x3f];
|
||||||
|
++out;
|
||||||
|
|
||||||
|
// part of first character and second
|
||||||
|
if (in_begin != in_end) {
|
||||||
|
i1 = static_cast<std::uint8_t>(*in_begin);
|
||||||
|
++in_begin;
|
||||||
|
|
||||||
|
*out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
|
||||||
|
++out;
|
||||||
|
} else {
|
||||||
|
*out = alpha[(i0 & 0x3) << 4];
|
||||||
|
++out;
|
||||||
|
|
||||||
|
// last padding
|
||||||
|
*out = pad;
|
||||||
|
++out;
|
||||||
|
|
||||||
|
// last padding
|
||||||
|
*out = pad;
|
||||||
|
++out;
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// part of second character and third
|
||||||
|
if (in_begin != in_end) {
|
||||||
|
i2 = static_cast<std::uint8_t>(*in_begin);
|
||||||
|
++in_begin;
|
||||||
|
|
||||||
|
*out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
|
||||||
|
++out;
|
||||||
|
} else {
|
||||||
|
*out = alpha[(i1 & 0xf) << 2];
|
||||||
|
++out;
|
||||||
|
|
||||||
|
// last padding
|
||||||
|
*out = pad;
|
||||||
|
++out;
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// rest of third
|
||||||
|
*out = alpha[i2 & 0x3f];
|
||||||
|
++out;
|
||||||
|
}
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
Encodes a string.
|
||||||
|
|
||||||
|
@param str the string that should be encoded
|
||||||
|
@param alphabet which alphabet should be used
|
||||||
|
@returns the encoded base64 string
|
||||||
|
@throws see base64::encode()
|
||||||
|
*/
|
||||||
|
static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
|
||||||
|
{
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
result.reserve(required_encode_size(str.length()) + 1);
|
||||||
|
|
||||||
|
encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
Encodes a char array.
|
||||||
|
|
||||||
|
@param buffer the char array
|
||||||
|
@param size the size of the array
|
||||||
|
@param alphabet which alphabet should be used
|
||||||
|
@returns the encoded string
|
||||||
|
*/
|
||||||
|
static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
|
||||||
|
{
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
result.reserve(required_encode_size(size) + 1);
|
||||||
|
|
||||||
|
encode(buffer, buffer + size, std::back_inserter(result), alphabet);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
|
||||||
|
in other words: inplace decoding is possible.
|
||||||
|
|
||||||
|
@warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
|
||||||
|
otherwise the behavior depends on the output iterator.
|
||||||
|
|
||||||
|
@tparam Input_iterator the source; the returned elements are cast to `char`
|
||||||
|
@tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
|
||||||
|
@param in_begin the beginning of the source
|
||||||
|
 @param in_end the ending of the source
 @param out the destination iterator
 @param alphabet which alphabet should be used
 @param behavior the behavior when an error was detected
 @returns the iterator to the next element past the last element copied
 @throws base64_error depending on the set behavior
 @throws see `Input_iterator` and `Output_iterator`
*/
template<typename Input_iterator, typename Output_iterator>
static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                              alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
{
    //constexpr auto pad = '=';
    std::uint8_t last = 0;
    auto bits         = 0;

    while (in_begin != in_end) {
        auto c = *in_begin;
        ++in_begin;

        if (c == '=') {
            break;
        }

        auto part = _base64_value(alphabet, c);

        // enough bits for one byte
        if (bits + 6 >= 8) {
            *out = (last << (8 - bits)) | (part >> (bits - 2));
            ++out;

            bits -= 2;
        } else {
            bits += 6;
        }

        last = part;
    }

    // check padding
    if (behavior != decoding_behavior::loose) {
        while (in_begin != in_end) {
            auto c = *in_begin;
            ++in_begin;

            if (c != '=') {
                throw base64_error("invalid base64 character.");
            }
        }
    }

    return out;
}
/**
 Decodes a string.

 @param str the base64 encoded string
 @param alphabet which alphabet should be used
 @param behavior the behavior when an error was detected
 @returns the decoded string
 @throws see base64::decode()
*/
static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
                          decoding_behavior behavior = decoding_behavior::moderate)
{
    std::string result;

    result.reserve(max_decode_size(str.length()));

    decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);

    return result;
}
/**
 Decodes a string.

 @param buffer the base64 encoded buffer
 @param size the size of the buffer
 @param alphabet which alphabet should be used
 @param behavior the behavior when an error was detected
 @returns the decoded string
 @throws see base64::decode()
*/
static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
                          decoding_behavior behavior = decoding_behavior::moderate)
{
    std::string result;

    result.reserve(max_decode_size(size));

    decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);

    return result;
}
/**
 Decodes a string inplace.

 @param[in,out] str the base64 encoded string
 @param alphabet which alphabet should be used
 @param behavior the behavior when an error was detected
 @throws base64::decode_inplace()
*/
static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
                           decoding_behavior behavior = decoding_behavior::moderate)
{
    str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
}
/**
 Decodes a char array inplace.

 @param[in,out] str the string array
 @param size the length of the array
 @param alphabet which alphabet should be used
 @param behavior the behavior when an error was detected
 @returns the pointer to the next element past the last element decoded
 @throws base64::decode_inplace()
*/
static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
                            decoding_behavior behavior = decoding_behavior::moderate)
{
    return decode(str, str + size, str, alphabet, behavior);
}
/**
 Returns the required decoding size for a given size. The value is calculated with the following formula:

 $$
 \lceil \frac{size}{4} \rceil \cdot 3
 $$

 @param size the size of the encoded input
 @returns the size of the resulting decoded buffer; this the absolute maximum
*/
static std::size_t max_decode_size(std::size_t size) noexcept
{
    return (size / 4 + (size % 4 ? 1 : 0)) * 3;
}
/**
 Returns the required encoding size for a given size. The value is calculated with the following formula:

 $$
 \lceil \frac{size}{3} \rceil \cdot 4
 $$

 @param size the size of the decoded input
 @returns the size of the resulting encoded buffer
*/
static std::size_t required_encode_size(std::size_t size) noexcept
{
    return (size / 3 + (size % 3 ? 1 : 0)) * 4;
}

private:
static std::uint8_t _base64_value(alphabet& alphabet, char c)
{
    if (c >= 'A' && c <= 'Z') {
        return c - 'A';
    } else if (c >= 'a' && c <= 'z') {
        return c - 'a' + 26;
    } else if (c >= '0' && c <= '9') {
        return c - '0' + 52;
    }

    // comes down to alphabet
    if (alphabet == alphabet::standard) {
        if (c == '+') {
            return 62;
        } else if (c == '/') {
            return 63;
        }
    } else if (alphabet == alphabet::url_filename_safe) {
        if (c == '-') {
            return 62;
        } else if (c == '_') {
            return 63;
        }
    } // auto detect
    else {
        if (c == '+') {
            alphabet = alphabet::standard;

            return 62;
        } else if (c == '/') {
            alphabet = alphabet::standard;

            return 63;
        } else if (c == '-') {
            alphabet = alphabet::url_filename_safe;

            return 62;
        } else if (c == '_') {
            alphabet = alphabet::url_filename_safe;

            return 63;
        }
    }

    throw base64_error("invalid base64 character.");
}
};

#endif // !PUBLIC_DOMAIN_BASE64_HPP_
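
Note: a minimal usage sketch of the decoder above (illustrative only; it assumes the enclosing class is the usual public-domain `base64` helper included as common/base64.hpp):

    // decode_example.cpp - hypothetical example, not part of this commit
    #include "base64.hpp"
    #include <iostream>

    int main() {
        const std::string encoded = "aGVsbG8=";              // base64 for "hello"
        try {
            // moderate behavior: non-padding characters after '=' throw base64_error
            const std::string decoded = base64::decode(encoded);
            std::cout << decoded << "\n";                     // prints "hello"
        } catch (const base64_error & e) {
            std::cerr << "decode failed: " << e.what() << "\n";
        }
        // worst-case decoded size for an n-byte input: ceil(n / 4) * 3
        std::cout << base64::max_decode_size(encoded.size()) << "\n";   // 6
        return 0;
    }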
4
common/build-info.cpp.in
Normal file
@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
+char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
@ -1,5 +1,4 @@
 #include "common.h"
-#include "build-info.h"
 #include "llama.h"
 
 #include <algorithm>
@ -13,6 +12,7 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include <cinttypes>
@ -91,6 +91,19 @@ void process_escapes(std::string& input) {
             case '\'': input[output_idx++] = '\''; break;
             case '\"': input[output_idx++] = '\"'; break;
             case '\\': input[output_idx++] = '\\'; break;
+            case 'x':
+                // Handle \x12, etc
+                if (input_idx + 2 < input_len) {
+                    const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
+                    char *err_p = nullptr;
+                    const long val = std::strtol(x, &err_p, 16);
+                    if (err_p == x + 2) {
+                        input_idx += 2;
+                        input[output_idx++] = char(val);
+                        break;
+                    }
+                }
+                // fall through
             default:   input[output_idx++] = '\\';
                        input[output_idx++] = input[input_idx]; break;
         }
@ -103,9 +116,24 @@ void process_escapes(std::string& input) {
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    bool result = true;
+    try {
+        if (!gpt_params_parse_ex(argc, argv, params)) {
+            gpt_print_usage(argc, argv, gpt_params());
+            exit(0);
+        }
+    }
+    catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(1);
+    }
+    return result;
+}
+
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
-    gpt_params default_params;
     const std::string arg_prefix = "--";
     llama_sampling_params & sparams = params.sparams;
 
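
Note: the net effect of this hunk is that argument errors now surface as std::invalid_argument from gpt_params_parse_ex() and are handled in one place by gpt_params_parse(), which prints the message plus usage and exits. A caller keeps the same shape as before; a hypothetical minimal driver (not part of this patch):

    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        // unknown flags or bad values are reported inside gpt_params_parse();
        // a false return still means "do not continue"
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }
        // ... use params ...
        return 0;
    }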
@ -204,12 +232,52 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = std::stof(argv[i]);
+        } else if (arg == "--rope-scaling") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string value(argv[i]);
+            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+            else { invalid_param = true; break; }
         } else if (arg == "--rope-scale") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+        } else if (arg == "--yarn-orig-ctx") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_orig_ctx = std::stoi(argv[i]);
+        } else if (arg == "--yarn-ext-factor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_ext_factor = std::stof(argv[i]);
+        } else if (arg == "--yarn-attn-factor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_attn_factor = std::stof(argv[i]);
+        } else if (arg == "--yarn-beta-fast") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_beta_fast = std::stof(argv[i]);
+        } else if (arg == "--yarn-beta-slow") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_beta_slow = std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {
@ -218,6 +286,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
+        } else if (arg == "--min-p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.min_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
@ -343,6 +417,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_sequences = std::stoi(argv[i]);
+        } else if (arg == "--p-accept" || arg == "-pa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_accept = std::stof(argv[i]);
+        } else if (arg == "--p-split" || arg == "-ps") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_split = std::stof(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
@ -406,8 +492,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
+        } else if (arg == "-cml" || arg == "--chatml") {
+            params.chatml = true;
         } else if (arg == "--infill") {
             params.infill = true;
+        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+            params.dump_kv_cache = true;
         } else if (arg == "--multiline-input") {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
@ -548,11 +638,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
         } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, default_params);
-#ifndef LOG_DISABLE_LOGS
-            log_print_usage();
-#endif // LOG_DISABLE_LOGS
-            exit(0);
+            return false;
         } else if (arg == "--random-prompt") {
             params.random_prompt = true;
         } else if (arg == "--in-prefix-bos") {
@ -611,22 +698,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
         } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, default_params);
-            exit(1);
+            throw std::invalid_argument("error: unknown argument: " + arg);
         }
     }
     if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
+        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
     }
     if (params.prompt_cache_all &&
             (params.interactive || params.interactive_first ||
              params.instruct)) {
-        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
+        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
     if (params.escape) {
@ -645,6 +727,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
+    printf("\n");
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@ -652,6 +735,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -i, --interactive run in interactive mode\n");
     printf(" --interactive-first run in interactive mode and wait for input right away\n");
     printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
+    printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
     printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
     printf(" -r PROMPT, --reverse-prompt PROMPT\n");
     printf(" halt generation at PROMPT, return control in interactive mode\n");
@ -679,6 +763,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
     printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@ -701,9 +786,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --cfg-negative-prompt-file FNAME\n");
     printf(" negative prompt file to use for guidance. (default: empty)\n");
     printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
-    printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+    printf(" --rope-scaling {none,linear,yarn}\n");
+    printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
+    printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
     printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
-    printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
+    printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
+    printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
+    printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
+    printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
+    printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
+    printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
     printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf(" --no-penalize-nl do not penalize newline token\n");
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
@ -717,6 +809,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
+    printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
+    printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
     printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
@ -744,6 +838,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf(" --verbose-prompt print prompt before generation\n");
+    printf(" -dkvc, --dump-kv-cache\n");
+    printf(" verbose print of the KV cache\n");
     printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@ -755,6 +851,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -ld LOGDIR, --logdir LOGDIR\n");
     printf(" path under which to save YAML logs (no logging if unset)\n");
     printf("\n");
+#ifndef LOG_DISABLE_LOGS
+    log_print_usage();
+#endif // LOG_DISABLE_LOGS
 }
 
 std::string get_system_info(const gpt_params & params) {
@ -817,8 +916,14 @@ struct llama_context_params llama_context_params_from_gpt_param
     cparams.f16_kv = params.memory_f16;
     cparams.logits_all = params.logits_all;
     cparams.embedding = params.embedding;
+    cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base = params.rope_freq_base;
     cparams.rope_freq_scale = params.rope_freq_scale;
+    cparams.yarn_ext_factor = params.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor;
+    cparams.yarn_beta_fast = params.yarn_beta_fast;
+    cparams.yarn_beta_slow = params.yarn_beta_slow;
+    cparams.yarn_orig_ctx = params.yarn_orig_ctx;
 
     return cparams;
 }
@ -834,7 +939,7 @@ void llama_batch_add(
                  const std::vector<llama_seq_id> & seq_ids,
                                        bool logits) {
     batch.token [batch.n_tokens] = id;
-    batch.pos   [batch.n_tokens] = pos,
+    batch.pos   [batch.n_tokens] = pos;
     batch.n_seq_id[batch.n_tokens] = seq_ids.size();
     for (size_t i = 0; i < seq_ids.size(); ++i) {
         batch.seq_id[batch.n_tokens][i] = seq_ids[i];
@ -975,6 +1080,12 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     return result;
 }
 
+bool llama_should_add_bos_token(const llama_model * model) {
+    const int add_bos = llama_add_bos_token(model);
+
+    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+}
+
 //
 // YAML utils
 //
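
Note: a short sketch of how the new helper is intended to be used when building a prompt (illustrative; it assumes the llama_tokenize(ctx, text, add_bos, special) wrapper declared elsewhere in common.h):

    // decide whether to prepend BOS from model metadata, falling back to the SPM default
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, add_bos, true);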
@ -1091,6 +1202,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
     if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
         data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
         data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
         data_str = "\"" + data_str + "\"";
         fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
         return;
@ -1128,8 +1240,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
     const llama_sampling_params & sparams = params.sparams;
 
-    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
-    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
+    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
     fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
@ -1275,6 +1387,81 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
+
+//
+// KV cache utils
+//
+
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        int seq_count = 0;
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] >= 0) { seq_count++; }
+        }
+        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
+    }
+
+    printf("\n=== Done dumping\n");
+}
+
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    std::unordered_map<llama_seq_id, size_t> seqs;
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] < 0) { continue; }
+            if (seqs.find(cs_curr[j]) == seqs.end()) {
+                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+                seqs[cs_curr[j]] = seqs.size();
+            }
+        }
+        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+    }
+
+    printf("=== Sequence legend: ");
+    for (const auto & it : seqs) {
+        printf("%zu=%d, ", it.second, it.first);
+    }
+    printf("'+'=other sequence ids");
+
+    c_curr = view.cells;
+    cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] >= 0) {
+                const auto & it = seqs.find(cs_curr[j]);
+                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
+            } else {
+                putchar('.');
+            }
+        }
+        putchar(' ');
+    }
+
+    printf("\n=== Done dumping\n");
+}
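
Note: a sketch of how these dump helpers are typically driven behind the new -dkvc flag (illustrative; the llama_kv_cache_view_init/update/free calls are assumed to come from the llama.h side of this same change set):

    if (params.dump_kv_cache) {
        // build a view sized for the number of parallel sequences, refresh it each step
        llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, params.n_parallel);
        llama_kv_cache_view_update(ctx, &kvc_view);
        dump_kv_cache_view_seqs(kvc_view, 40);   // per-sequence view, 40 cells per row
        llama_kv_cache_view_free(&kvc_view);
    }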
@ -9,6 +9,7 @@
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
+#include <cmath>
 #include <string>
 #include <vector>
 #include <random>
@ -26,10 +27,16 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
 
 #define print_build_info() do { \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)
 
+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;
+
 //
 // CLI argument parsing
 //
@ -37,6 +44,7 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed = -1; // RNG seed
+
     int32_t n_threads = get_num_physical_cores();
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_predict = -1; // new tokens to predict
@ -47,6 +55,8 @@ struct gpt_params {
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
+    float p_accept = 0.5f; // speculative decoding accept probability
+    float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
@ -54,6 +64,13 @@ struct gpt_params {
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast = 32.0f; // YaRN low correction dim
+    float yarn_beta_slow = 1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx = 0; // YaRN original context length
+    int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+                                                               // pinging @cebtenzzre
 
     // // sampling parameters
     struct llama_sampling_params sparams;
@ -85,6 +102,7 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
 
@ -104,12 +122,15 @@ struct gpt_params {
     bool numa = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt = false; // print prompt tokens before generation
     bool infill = false; // use infill mode
+    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
     std::string image = ""; // path to an image file
 };
 
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 
 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
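
Note: the new gpt_params fields flow straight into llama_context_params via llama_context_params_from_gpt_params() (see the common.cpp hunk above). A small sketch of configuring YaRN scaling programmatically instead of through the CLI flags (values are illustrative):

    gpt_params params;
    params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;   // same constant the --rope-scaling parser uses
    params.yarn_orig_ctx     = 4096;                      // original training context of the model
    params.yarn_beta_fast    = 32.0f;
    params.yarn_beta_slow    = 1.0f;

    llama_context_params cparams = llama_context_params_from_gpt_params(params);
    // cparams.rope_scaling_type and cparams.yarn_* now carry the values set above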
@ -181,6 +202,10 @@ std::string llama_detokenize_bpe(
     llama_context * ctx,
     const std::vector<llama_token> & tokens);
 
+// Uses the value from the model metadata if possible, otherwise
+// defaults to true when model type is SPM, otherwise false.
+bool llama_should_add_bos_token(const llama_model * model);
+
 //
 // YAML utils
 //
@ -194,3 +219,13 @@ std::string get_sortable_timestamp();
 void dump_non_result_info_yaml(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
+//
+// KV cache utils
+//
+
+// Dump the KV cache view with the number of sequences per cell.
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+
+// Dump the KV cache view showing individual sequences in each cell (long output).
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
122
common/log.h
@ -97,38 +97,56 @@
 #define LOG_TEE_TARGET stderr
 #endif
 
-// NOTE: currently disabled as it produces too many log files
+// Utility for synchronizing log configuration state
+// since std::optional was introduced only in c++17
+enum LogTriState
+{
+    LogTriStateSame,
+    LogTriStateFalse,
+    LogTriStateTrue
+};
+
 // Utility to obtain "pid" like unique process id and use it when creating log files.
-//inline std::string log_get_pid()
-//{
-//    static std::string pid;
-//    if (pid.empty())
-//    {
-//        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
-//        // it's not the same as "pid" but is unique enough to solve multiple instances
-//        // trying to write to the same log.
-//        std::stringstream ss;
-//        ss << std::this_thread::get_id();
-//        pid = ss.str();
-//    }
-//
-//    return pid;
-//}
+inline std::string log_get_pid()
+{
+    static std::string pid;
+    if (pid.empty())
+    {
+        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+        // it's not the same as "pid" but is unique enough to solve multiple instances
+        // trying to write to the same log.
+        std::stringstream ss;
+        ss << std::this_thread::get_id();
+        pid = ss.str();
+    }
+
+    return pid;
+}
 
 // Utility function for generating log file names with unique id based on thread id.
 // invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
 // where the number is a runtime id of the current thread.
 
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
 
 // INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
+inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
 {
+    static bool _multilog = false;
+
+    if (multilog != LogTriStateSame)
+    {
+        _multilog = multilog == LogTriStateTrue;
+    }
+
     std::stringstream buf;
 
     buf << log_file_basename;
-    //buf << ".";
-    //buf << log_get_pid();
+    if (_multilog)
+    {
+        buf << ".";
+        buf << log_get_pid();
+    }
     buf << ".";
     buf << log_file_extension;
 
@ -213,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 #define LOG_TEE_FLF_VAL ,""
 #endif
 
-// Utility for synchronizing log configuration state
-// since std::optional was introduced only in c++17
-enum LogTriState
-{
-    LogTriStateSame,
-    LogTriStateFalse,
-    LogTriStateTrue
-};
-
 // INTERNAL, DO NOT USE
 // USE LOG() INSTEAD
 //
@ -315,16 +324,23 @@ enum LogTriState
 #endif
 
 // INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
 {
-    static bool _initialized{false};
-    static bool _disabled{(filename.empty() && target == nullptr)};
+    static bool _initialized = false;
+    static bool _append = false;
+    static bool _disabled = filename.empty() && target == nullptr;
     static std::string log_current_filename{filename};
     static FILE *log_current_target{target};
     static FILE *logfile = nullptr;
 
     if (change)
     {
+        if (append != LogTriStateSame)
+        {
+            _append = append == LogTriStateTrue;
+            return logfile;
+        }
+
         if (disable == LogTriStateTrue)
         {
             // Disable primary target
@ -377,7 +393,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
         }
     }
 
-        logfile = fopen(filename.c_str(), "w");
+        logfile = fopen(filename.c_str(), _append ? "a" : "w");
     }
 
     if (!logfile)
@ -398,9 +414,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
 }
 
 // INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
 {
-    return log_handler1_impl(change, disable, filename, target);
+    return log_handler1_impl(change, append, disable, filename, target);
 }
 
 // Disables logs entirely at runtime.
@ -411,7 +427,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTri
 // INTERNAL, DO NOT USE
 inline FILE *log_disable_impl()
 {
-    return log_handler1_impl(true, LogTriStateTrue);
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
 }
 
 // Enables logs at runtime.
@ -420,19 +436,31 @@ inline FILE *log_disable_impl()
 // INTERNAL, DO NOT USE
 inline FILE *log_enable_impl()
 {
-    return log_handler1_impl(true, LogTriStateFalse);
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
 }
 
 // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
 #define log_set_target(target) log_set_target_impl(target)
 
 // INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
 
 // INTERNAL, DO NOT USE
 inline FILE *log_handler() { return log_handler1_impl(); }
 
+// Enable or disable creating separate log files for each run.
+// can ONLY be invoked BEFORE first log use.
+#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
+// Enable or disable append mode for log file.
+// can ONLY be invoked BEFORE first log use.
+#define log_append(enable) log_append_impl(enable)
+// INTERNAL, DO NOT USE
+inline FILE *log_append_impl(bool enable)
+{
+    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
+}
+
 inline void log_test()
 {
     log_disable();
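
Note: the two new knobs only take effect if they are set before the first log write, as the comments above state. A minimal configuration sketch (hypothetical; LOG(), log_set_target() and log_filename_generator() are the existing macros from this header):

    int main(int argc, char ** argv) {
        log_multilog(true);    // one "<name>.<thread-id>.log" file per run
        log_append(true);      // open the log file in append mode instead of truncating
        log_set_target(log_filename_generator("llama", "log"));
        LOG("logging configured\n");
        return 0;
    }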
@ -494,6 +522,18 @@ inline bool log_param_single_parse(const std::string & param)
         return true;
     }
 
+    if (param == "--log-new")
+    {
+        log_multilog(true);
+        return true;
+    }
+
+    if (param == "--log-append")
+    {
+        log_append(true);
+        return true;
+    }
+
     return false;
 }
 
@ -523,7 +563,9 @@ inline void log_print_usage()
     printf(" --log-disable Disable trace logs\n");
     printf(" --log-enable Enable trace logs\n");
     printf(" --log-file Specify a log filename (without extension)\n");
-    printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
+    printf(" --log-new Create a separate new log file on start. "
+           "Each log file will have unique name: \"<name>.<ID>.log\"\n");
+    printf(" --log-append Don't truncate the old log file.\n");
 }
 
 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
@ -39,6 +39,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
 void llama_sampling_reset(llama_sampling_context * ctx) {
     if (ctx->grammar != NULL) {
         llama_grammar_free(ctx->grammar);
+        ctx->grammar = NULL;
     }
 
     if (!ctx->parsed_grammar.rules.empty()) {
@ -89,10 +90,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 
     snprintf(result, sizeof(result),
         "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-        "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+        "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
         "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
         params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-        params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+        params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
         params.mirostat, params.mirostat_eta, params.mirostat_tau);
 
     return std::string(result);
@ -110,6 +111,7 @@ llama_token llama_sampling_sample(
     const float temp = params.temp;
     const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
     const float top_p = params.top_p;
+    const float min_p = params.min_p;
     const float tfs_z = params.tfs_z;
     const float typical_p = params.typical_p;
     const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@ -190,6 +192,7 @@ llama_token llama_sampling_sample(
         llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
         llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
         llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
+        llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep);
        llama_sample_temp (ctx_main, &cur_p, temp);
 
         id = llama_sample_token(ctx_main, &cur_p);
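
Note: for context on the new sampler stage, min-p keeps only candidates whose probability is at least min_p times the probability of the most likely token, then hands the survivors to the temperature step. A self-contained sketch of that rule (just the idea, not the llama.cpp implementation):

    #include <algorithm>
    #include <vector>

    // probs: candidate probabilities, normalized and sorted in descending order
    std::vector<float> min_p_filter(std::vector<float> probs, float min_p, size_t min_keep) {
        if (probs.empty() || min_p <= 0.0f) {
            return probs;
        }
        const float threshold = min_p * probs.front();   // scale the cutoff by the top probability
        size_t keep = probs.size();
        for (size_t i = 0; i < probs.size(); ++i) {
            if (probs[i] < threshold) { keep = i; break; }
        }
        keep = std::max(keep, min_keep);                 // never drop below min_keep candidates
        probs.resize(std::min(keep, probs.size()));
        return probs;
    }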
@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t top_k = 40; // <= 0 to use vocab size
     float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
     float tfs_z = 1.00f; // 1.0 = disabled
     float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // 1.0 = disabled
@ -32,6 +32,7 @@ struct train_state * init_train_state() {
     state->opt = new struct ggml_opt_context;
     state->opt->ctx = NULL;
     state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+    state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
     state->opt->loss_after = 0.0f;
 
     return state;
@ -1045,6 +1046,7 @@ struct train_params_common get_default_train_params_common() {
     params.n_batch = 8;
     params.n_gradient_accumulation = 1;
     params.n_epochs = -1;
+    params.n_gpu_layers = 0;
 
     params.custom_n_ctx = false;
 
@ -1080,6 +1082,7 @@ struct train_params_common get_default_train_params_common() {
     params.adam_beta2 = 0.999f;
     params.adam_gclip = 1.0f;
     params.adam_eps_f = 0.0f;
+
     return params;
 }
 
@ -1133,6 +1136,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
     fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
     fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
     fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
+    fprintf(stderr, " -ngl N, --n-gpu-layers N Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
     fprintf(stderr, "\n");
 }
 
@ -1352,6 +1356,17 @@ bool consume_common_train_arg(
             return true;
         }
         params->adam_gclip = std::stof(argv[i]);
+    } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+        params->n_gpu_layers = std::stoi(argv[i]);
+#else
+        fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+        fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
     } else if (arg == "-h" || arg == "--help") {
         params->print_usage = true;
         return true;
@ -9,6 +9,8 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#define LLAMA_TRAIN_MAX_NODES 16384
|
||||||
|
|
||||||
typedef std::string mt19937_state;
|
typedef std::string mt19937_state;
|
||||||
|
|
||||||
struct train_state {
|
struct train_state {
|
||||||
@ -44,6 +46,7 @@ struct train_params_common {
|
|||||||
int n_batch;
|
int n_batch;
|
||||||
int n_gradient_accumulation;
|
int n_gradient_accumulation;
|
||||||
int n_epochs;
|
int n_epochs;
|
||||||
|
int n_gpu_layers;
|
||||||
|
|
||||||
bool custom_n_ctx;
|
bool custom_n_ctx;
|
||||||
|
|
||||||
|
@ -1,316 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# HF baichuan --> gguf conversion
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import TYPE_CHECKING, Any
|
|
||||||
import itertools
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from typing import TypeAlias
|
|
||||||
|
|
||||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
|
||||||
|
|
||||||
# reverse HF permute back to original pth layout
|
|
||||||
|
|
||||||
|
|
||||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
|
|
||||||
if n_kv_head is not None and n_head != n_kv_head:
|
|
||||||
n_head //= n_kv_head
|
|
||||||
|
|
||||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
|
||||||
.swapaxes(1, 2)
|
|
||||||
.reshape(weights.shape))
|
|
||||||
|
|
||||||
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
|
|
||||||
r = weights.shape[0] // 3
|
|
||||||
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
|
||||||
|
|
||||||
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
|
|
||||||
r = weights.shape[0] // 3
|
|
||||||
return weights[r * n_part : r * n_part + r, ...]
|
|
||||||
|
|
||||||
def count_model_parts(dir_model: str) -> int:
|
|
||||||
num_parts = 0
|
|
||||||
|
|
||||||
for filename in os.listdir(dir_model):
|
|
||||||
if filename.startswith("pytorch_model-"):
|
|
||||||
num_parts += 1
|
|
||||||
|
|
||||||
if num_parts > 0:
|
|
||||||
print("gguf: found " + str(num_parts) + " model parts")
|
|
||||||
|
|
||||||
return num_parts
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-only", action="store_true",
|
|
||||||
help="extract only the vocab",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--outfile", type=Path,
|
|
||||||
help="path to write to; default: based on input",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"model", type=Path,
|
|
||||||
help="directory containing model file, or model file itself (*.bin)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
|
||||||
help="output format - use 0 for float32, 1 for float16",
|
|
||||||
)
|
|
||||||
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
dir_model = args.model
|
|
||||||
ftype = args.ftype
|
|
||||||
if not dir_model.is_dir():
|
|
||||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
endianess = gguf.GGUFEndian.LITTLE
|
|
||||||
if args.bigendian:
|
|
||||||
endianess = gguf.GGUFEndian.BIG
|
|
||||||
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
|
|
||||||
print(f"gguf: Conversion Endianess {endianess}")
|
|
||||||
# possible tensor data types
|
|
||||||
# ftype == 0 -> float32
|
|
||||||
# ftype == 1 -> float16
|
|
||||||
|
|
||||||
# map from ftype to string
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
|
|
||||||
if args.outfile is not None:
|
|
||||||
fname_out = args.outfile
|
|
||||||
else:
|
|
||||||
# output in the same directory as the model by default
|
|
||||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
|
||||||
|
|
||||||
print("gguf: loading model "+dir_model.name)
|
|
||||||
|
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
||||||
hparams = json.load(f)
|
|
||||||
print("hello print: ",hparams["architectures"][0])
|
|
||||||
if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
|
|
||||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
|
||||||
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
# get number of model parts
|
|
||||||
num_parts = count_model_parts(dir_model)
|
|
||||||
print(f"num_parts:{num_parts}\n")
|
|
||||||
ARCH=gguf.MODEL_ARCH.BAICHUAN
|
|
||||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
|
||||||
|
|
||||||
print("gguf: get model metadata")
|
|
||||||
|
|
||||||
block_count = hparams["num_hidden_layers"]
|
|
||||||
head_count = hparams["num_attention_heads"]
|
|
||||||
|
|
||||||
if "num_key_value_heads" in hparams:
|
|
||||||
head_count_kv = hparams["num_key_value_heads"]
|
|
||||||
else:
|
|
||||||
head_count_kv = head_count
|
|
||||||
|
|
||||||
if "_name_or_path" in hparams:
|
|
||||||
hf_repo = hparams["_name_or_path"]
|
|
||||||
else:
|
|
||||||
hf_repo = ""
|
|
||||||
|
|
||||||
if "max_sequence_length" in hparams:
|
|
||||||
ctx_length = hparams["max_sequence_length"]
|
|
||||||
elif "max_position_embeddings" in hparams:
|
|
||||||
ctx_length = hparams["max_position_embeddings"]
|
|
||||||
elif "model_max_length" in hparams:
|
|
||||||
ctx_length = hparams["model_max_length"]
|
|
||||||
else:
|
|
||||||
print("gguf: can not find ctx length parameter.")
|
|
||||||
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
|
|
||||||
gguf_writer.add_name(dir_model.name)
|
|
||||||
gguf_writer.add_source_hf_repo(hf_repo)
|
|
||||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
|
||||||
gguf_writer.add_context_length(ctx_length)
|
|
||||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
|
||||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
|
||||||
gguf_writer.add_head_count(head_count)
|
|
||||||
gguf_writer.add_head_count_kv(head_count_kv)
|
|
||||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
|
||||||
|
|
||||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
|
||||||
if "type" in hparams["rope_scaling"]:
|
|
||||||
if hparams["rope_scaling"]["type"] == "linear":
|
|
||||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
|
||||||
|
|
||||||
|
|
||||||
# TOKENIZATION
|
|
||||||
|
|
||||||
print("gguf: get tokenizer metadata")
|
|
||||||
|
|
||||||
tokens: list[bytes] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
tokenizer_model_file = dir_model / 'tokenizer.model'
|
|
||||||
if not tokenizer_model_file.is_file():
|
|
||||||
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# vocab type sentencepiece
|
|
||||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
|
||||||
|
|
||||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
|
||||||
vocab_size = hparams.get('vocab_size')
|
|
||||||
if vocab_size is None:
|
|
||||||
vocab_size = tokenizer.vocab_size()
|
|
||||||
|
|
||||||
for i in range(vocab_size):
|
|
||||||
text: bytes
|
|
||||||
score: float
|
|
||||||
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
text = piece.encode("utf-8")
|
|
||||||
score = tokenizer.get_score(i)
|
|
||||||
|
|
||||||
toktype = 1 # defualt to normal token type
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
toktype = 2
|
|
||||||
if tokenizer.is_control(i):
|
|
||||||
toktype = 3
|
|
||||||
|
|
||||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
|
||||||
|
|
||||||
if tokenizer.is_unused(i):
|
|
||||||
toktype = 5
|
|
||||||
if tokenizer.is_byte(i):
|
|
||||||
toktype = 6
|
|
||||||
|
|
||||||
tokens.append(text)
|
|
||||||
scores.append(score)
|
|
||||||
toktypes.append(toktype)
|
|
||||||
|
|
||||||
added_tokens_file = dir_model / 'added_tokens.json'
|
|
||||||
if added_tokens_file.is_file():
|
|
||||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
|
||||||
addtokens_json = json.load(f)
|
|
||||||
|
|
||||||
print("gguf: get added tokens")
|
|
||||||
|
|
||||||
for key in addtokens_json:
|
|
||||||
tokens.append( key.encode("utf-8") )
|
|
||||||
scores.append(-1000.0)
|
|
||||||
toktypes.append(4) # user-defined token type
|
|
||||||
|
|
||||||
|
|
||||||
gguf_writer.add_tokenizer_model("llama")
|
|
||||||
gguf_writer.add_token_list(tokens)
|
|
||||||
gguf_writer.add_token_scores(scores)
|
|
||||||
gguf_writer.add_token_types(toktypes)
|
|
||||||
|
|
||||||
special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
|
|
||||||
special_vocab.add_to_gguf(gguf_writer)
|
|
||||||
|
|
||||||
# TENSORS
|
|
||||||
|
|
||||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
|
||||||
|
|
||||||
# tensor info
|
|
||||||
print("gguf: get tensor metadata")
|
|
||||||
|
|
||||||
if num_parts == 0:
|
|
||||||
part_names = iter(("pytorch_model.bin",))
|
|
||||||
else:
|
|
||||||
part_names = (
|
|
||||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
for part_name in part_names:
|
|
||||||
if args.vocab_only:
|
|
||||||
break
|
|
||||||
print("gguf: loading model part '" + part_name + "'")
|
|
||||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
|
||||||
|
|
||||||
tmp=model_part
|
|
||||||
for i in range(block_count):
|
|
||||||
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
|
|
||||||
print(f"Unpacking and permuting layer {i}")
|
|
||||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
|
|
||||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
|
|
||||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
|
|
||||||
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
|
||||||
|
|
||||||
for name in model_part.keys():
|
|
||||||
data = model_part[name]
|
|
||||||
# we don't need these
|
|
||||||
if name.endswith(".rotary_emb.inv_freq"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
old_dtype = data.dtype
|
|
||||||
|
|
||||||
# convert any unsupported data types to float32
|
|
||||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
|
||||||
data = data.to(torch.float32)
|
|
||||||
|
|
||||||
data = data.squeeze().numpy()
|
|
||||||
|
|
||||||
# map tensor names
|
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
|
||||||
if new_name is None:
|
|
||||||
print("Can not map tensor '" + name + "'")
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
data_dtype = data.dtype
|
|
||||||
|
|
||||||
# if f32 desired, convert any float16 to float32
|
|
||||||
if ftype == 0 and data_dtype == np.float16:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
|
||||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
|
||||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
|
||||||
data = data.astype(np.float16)
|
|
||||||
|
|
||||||
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
|
||||||
gguf_writer.add_tensor(new_name, data)
|
|
||||||
|
|
||||||
|
|
||||||
print("gguf: write header")
|
|
||||||
gguf_writer.write_header_to_file()
|
|
||||||
print("gguf: write metadata")
|
|
||||||
gguf_writer.write_kv_data_to_file()
|
|
||||||
if not args.vocab_only:
|
|
||||||
print("gguf: write tensors")
|
|
||||||
gguf_writer.write_tensors_to_file()
|
|
||||||
|
|
||||||
gguf_writer.close()
|
|
||||||
|
|
||||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
|
||||||
print("")
|
|
@ -1,247 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# HF bloom --> gguf conversion
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import struct
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from transformers import AutoTokenizer # type: ignore[import]
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
|
|
||||||
def count_model_parts(dir_model: Path) -> int:
|
|
||||||
num_parts = 0
|
|
||||||
for filename in os.listdir(dir_model):
|
|
||||||
if filename.startswith("pytorch_model-"):
|
|
||||||
num_parts += 1
|
|
||||||
|
|
||||||
if num_parts > 0:
|
|
||||||
print("gguf: found " + str(num_parts) + " model parts")
|
|
||||||
return num_parts
|
|
||||||
|
|
||||||
|
|
||||||
# Supported Models:
|
|
||||||
# https://huggingface.co/bigscience/bloom-1b7
|
|
||||||
# https://huggingface.co/bigscience/bloom-3b
|
|
||||||
# https://huggingface.co/bigscience/bloom-7b1
|
|
||||||
# https://huggingface.co/Langboat/bloom-1b4-zh
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
|
|
||||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
|
||||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
|
||||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
|
||||||
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
dir_model = args.model
|
|
||||||
ftype = args.ftype
|
|
||||||
if not dir_model.is_dir():
|
|
||||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# possible tensor data types
|
|
||||||
# ftype == 0 -> float32
|
|
||||||
# ftype == 1 -> float16
|
|
||||||
|
|
||||||
# map from ftype to string
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
|
|
||||||
if args.outfile is not None:
|
|
||||||
fname_out = args.outfile
|
|
||||||
else:
|
|
||||||
# output in the same directory as the model by default
|
|
||||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
|
||||||
|
|
||||||
print("gguf: loading model "+dir_model.name)
|
|
||||||
|
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
||||||
hparams = json.load(f)
|
|
||||||
|
|
||||||
if hparams["architectures"][0] != "BloomForCausalLM":
|
|
||||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# get number of model parts
|
|
||||||
num_parts = count_model_parts(dir_model)
|
|
||||||
|
|
||||||
ARCH=gguf.MODEL_ARCH.BLOOM
|
|
||||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
|
||||||
|
|
||||||
print("gguf: get model metadata")
|
|
||||||
|
|
||||||
block_count = hparams["n_layer"]
|
|
||||||
|
|
||||||
gguf_writer.add_name("Bloom")
|
|
||||||
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
|
|
||||||
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
|
|
||||||
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
|
|
||||||
gguf_writer.add_embedding_length(n_embed)
|
|
||||||
gguf_writer.add_feed_forward_length(4 * n_embed)
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_head_count(n_head)
|
|
||||||
gguf_writer.add_head_count_kv(n_head)
|
|
||||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
|
||||||
gguf_writer.add_file_type(ftype)
|
|
||||||
|
|
||||||
# TOKENIZATION
|
|
||||||
|
|
||||||
print("gguf: get tokenizer metadata")
|
|
||||||
|
|
||||||
tokens: list[bytearray] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
# gpt2 tokenizer
|
|
||||||
gguf_writer.add_tokenizer_model("gpt2")
|
|
||||||
|
|
||||||
print("gguf: get gpt2 tokenizer vocab")
|
|
||||||
|
|
||||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|
||||||
|
|
||||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
|
||||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
|
||||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
|
||||||
assert max(tokenizer.vocab.values()) < vocab_size
|
|
||||||
|
|
||||||
added_vocab = tokenizer.get_added_vocab()
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
|
||||||
|
|
||||||
for i in range(vocab_size):
|
|
||||||
if i not in reverse_vocab:
|
|
||||||
tokens.append(f"[PAD{i}]")
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
|
||||||
elif reverse_vocab[i] in added_vocab:
|
|
||||||
tokens.append(reverse_vocab[i])
|
|
||||||
if tokenizer.added_tokens_decoder[i].special:
|
|
||||||
toktypes.append(gguf.TokenType.CONTROL)
|
|
||||||
else:
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
|
||||||
else:
|
|
||||||
tokens.append(reverse_vocab[i])
|
|
||||||
toktypes.append(gguf.TokenType.NORMAL)
|
|
||||||
|
|
||||||
gguf_writer.add_token_list(tokens)
|
|
||||||
gguf_writer.add_token_types(toktypes)
|
|
||||||
|
|
||||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
|
|
||||||
special_vocab.add_to_gguf(gguf_writer)
|
|
||||||
|
|
||||||
# TENSORS
|
|
||||||
|
|
||||||
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
|
|
||||||
|
|
||||||
# params for qkv transform
|
|
||||||
n_head_kv = hparams.get("n_head_kv", n_head)
|
|
||||||
head_dim = n_embed // n_head
|
|
||||||
|
|
||||||
# tensor info
|
|
||||||
print("gguf: get tensor metadata")
|
|
||||||
|
|
||||||
if num_parts == 0:
|
|
||||||
part_names = iter(("pytorch_model.bin",))
|
|
||||||
else:
|
|
||||||
part_names = (
|
|
||||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
for part_name in part_names:
|
|
||||||
if args.vocab_only:
|
|
||||||
break
|
|
||||||
print("gguf: loading model part '" + part_name + "'")
|
|
||||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
|
||||||
|
|
||||||
has_lm_head = True
|
|
||||||
if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
|
|
||||||
has_lm_head = False
|
|
||||||
|
|
||||||
for original_name in model_part.keys():
|
|
||||||
data = model_part[original_name]
|
|
||||||
name = re.sub(r'transformer\.', '', original_name)
|
|
||||||
|
|
||||||
old_dtype = data.dtype
|
|
||||||
|
|
||||||
# convert any unsupported data types to float32
|
|
||||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
|
||||||
data = data.to(torch.float32)
|
|
||||||
|
|
||||||
data = data.squeeze().numpy()
|
|
||||||
|
|
||||||
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
|
|
||||||
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
|
||||||
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
|
||||||
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
|
||||||
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
|
|
||||||
data = np.concatenate(
|
|
||||||
(qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
|
||||||
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
|
||||||
qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
|
|
||||||
axis=0
|
|
||||||
)
|
|
||||||
print("re-format attention.linear_qkv.weight")
|
|
||||||
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
|
|
||||||
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
|
|
||||||
data = np.concatenate(
|
|
||||||
(qkv_bias[:, 0, :].reshape((n_embed,)),
|
|
||||||
qkv_bias[:, 1, :].reshape((n_embed,)),
|
|
||||||
qkv_bias[:, 2, :].reshape((n_embed,))),
|
|
||||||
axis=0
|
|
||||||
)
|
|
||||||
print("re-format attention.linear_qkv.bias")
|
|
||||||
|
|
||||||
# map tensor names
|
|
||||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
|
||||||
if new_name is None:
|
|
||||||
print("Can not map tensor '" + name + "'")
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
data_dtype = data.dtype
|
|
||||||
|
|
||||||
# if f32 desired, convert any float16 to float32
|
|
||||||
if ftype == 0 and data_dtype == np.float16:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
|
||||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
|
||||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
|
||||||
data = data.astype(np.float16)
|
|
||||||
|
|
||||||
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
|
||||||
|
|
||||||
gguf_writer.add_tensor(new_name, data)
|
|
||||||
|
|
||||||
if not has_lm_head and name == "word_embeddings.weight":
|
|
||||||
gguf_writer.add_tensor("output.weight", data)
|
|
||||||
print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa
|
|
||||||
|
|
||||||
|
|
||||||
print("gguf: write header")
|
|
||||||
gguf_writer.write_header_to_file()
|
|
||||||
print("gguf: write metadata")
|
|
||||||
gguf_writer.write_kv_data_to_file()
|
|
||||||
if not args.vocab_only:
|
|
||||||
print("gguf: write tensors")
|
|
||||||
gguf_writer.write_tensors_to_file()
|
|
||||||
|
|
||||||
gguf_writer.close()
|
|
||||||
|
|
||||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
|
||||||
print("")
|
|
@ -1,253 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# HF falcon--> gguf conversion
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import contextlib
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from transformers import AutoTokenizer # type: ignore[import]
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
|
|
||||||
def count_model_parts(dir_model: Path, prefix: str) -> int:
|
|
||||||
num_parts = 0
|
|
||||||
for filename in os.listdir(dir_model):
|
|
||||||
if filename.startswith(prefix):
|
|
||||||
num_parts += 1
|
|
||||||
|
|
||||||
if num_parts > 0:
|
|
||||||
print("gguf: found " + str(num_parts) + " model parts")
|
|
||||||
return num_parts
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-only", action="store_true",
|
|
||||||
help="extract only the vocab",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--outfile", type=Path,
|
|
||||||
help="path to write to; default: based on input",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"model", type=Path,
|
|
||||||
help="directory containing model file, or model file itself (*.bin)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
|
||||||
help="output format - use 0 for float32, 1 for float16",
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
dir_model = args.model
|
|
||||||
ftype = args.ftype
|
|
||||||
if not dir_model.is_dir():
|
|
||||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# possible tensor data types
|
|
||||||
# ftype == 0 -> float32
|
|
||||||
# ftype == 1 -> float16
|
|
||||||
|
|
||||||
# map from ftype to string
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
|
|
||||||
if args.outfile is not None:
|
|
||||||
fname_out = args.outfile
|
|
||||||
else:
|
|
||||||
# output in the same directory as the model by default
|
|
||||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
|
||||||
|
|
||||||
print("gguf: loading model "+dir_model.name)
|
|
||||||
|
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
||||||
hparams = json.load(f)
|
|
||||||
|
|
||||||
if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
|
|
||||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
|
||||||
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# get number of model parts
|
|
||||||
num_parts = count_model_parts(dir_model, "model-00")
|
|
||||||
if num_parts:
|
|
||||||
is_safetensors = True
|
|
||||||
from safetensors import safe_open
|
|
||||||
else:
|
|
||||||
is_safetensors = False
|
|
||||||
num_parts = count_model_parts(dir_model, "pytorch_model-")
|
|
||||||
|
|
||||||
ARCH=gguf.MODEL_ARCH.FALCON
|
|
||||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
|
||||||
|
|
||||||
print("gguf: get model metadata")
|
|
||||||
|
|
||||||
block_count = hparams.get("num_hidden_layers")
|
|
||||||
if block_count is None:
|
|
||||||
block_count = hparams["n_layer"] # old name
|
|
||||||
|
|
||||||
n_head = hparams.get("num_attention_heads")
|
|
||||||
if n_head is None:
|
|
||||||
n_head = hparams["n_head"] # old name
|
|
||||||
|
|
||||||
n_head_kv = hparams.get("num_kv_heads")
|
|
||||||
if n_head_kv is None:
|
|
||||||
n_head_kv = hparams.get("n_head_kv", 1) # old name
|
|
||||||
|
|
||||||
gguf_writer.add_name("Falcon")
|
|
||||||
gguf_writer.add_context_length(2048) # not in config.json
|
|
||||||
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
|
||||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
|
||||||
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_head_count(n_head)
|
|
||||||
gguf_writer.add_head_count_kv(n_head_kv)
|
|
||||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
|
||||||
gguf_writer.add_file_type(ftype)
|
|
||||||
|
|
||||||
# TOKENIZATION
|
|
||||||
|
|
||||||
print("gguf: get tokenizer metadata")
|
|
||||||
|
|
||||||
tokens: list[bytearray] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
# gpt2 tokenizer
|
|
||||||
gguf_writer.add_tokenizer_model("gpt2")
|
|
||||||
|
|
||||||
print("gguf: get gpt2 tokenizer vocab")
|
|
||||||
|
|
||||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|
||||||
|
|
||||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
|
||||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
|
||||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
|
||||||
assert max(tokenizer.vocab.values()) < vocab_size
|
|
||||||
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
|
||||||
|
|
||||||
for i in range(vocab_size):
|
|
||||||
tokens.append(reverse_vocab[i])
|
|
||||||
scores.append(0.0) # dummy
|
|
||||||
toktypes.append(gguf.TokenType.NORMAL)
|
|
||||||
|
|
||||||
gguf_writer.add_token_list(tokens)
|
|
||||||
gguf_writer.add_token_scores(scores)
|
|
||||||
gguf_writer.add_token_types(toktypes)
|
|
||||||
|
|
||||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
|
||||||
special_vocab.add_to_gguf(gguf_writer)
|
|
||||||
|
|
||||||
# TENSORS
|
|
||||||
|
|
||||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
|
||||||
|
|
||||||
head_dim = hparams["hidden_size"] // n_head
|
|
||||||
|
|
||||||
# tensor info
|
|
||||||
print("gguf: get tensor metadata")
|
|
||||||
|
|
||||||
if num_parts == 0:
|
|
||||||
part_names = iter(("pytorch_model.bin",))
|
|
||||||
elif is_safetensors:
|
|
||||||
part_names = (
|
|
||||||
f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
part_names = (
|
|
||||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
for part_name in part_names:
|
|
||||||
if args.vocab_only:
|
|
||||||
break
|
|
||||||
print("gguf: loading model part '" + part_name + "'")
|
|
||||||
if is_safetensors:
|
|
||||||
ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
|
|
||||||
else:
|
|
||||||
ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
|
|
||||||
|
|
||||||
with ctx as model_part:
|
|
||||||
for name in model_part.keys():
|
|
||||||
data = model_part.get_tensor(name) if is_safetensors else model_part[name]
|
|
||||||
|
|
||||||
old_dtype = data.dtype
|
|
||||||
|
|
||||||
# convert any unsupported data types to float32
|
|
||||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
|
||||||
data = data.to(torch.float32)
|
|
||||||
|
|
||||||
# QKV tensor transform
|
|
||||||
# The original query_key_value tensor contains n_head_kv "kv groups",
|
|
||||||
# each consisting of n_head/n_head_kv query weights followed by one key
|
|
||||||
# and one value weight (shared by all query heads in the kv group).
|
|
||||||
# This layout makes it a big pain to work with in GGML.
|
|
||||||
# So we rearrange them here,, so that we have n_head query weights
|
|
||||||
# followed by n_head_kv key weights followed by n_head_kv value weights,
|
|
||||||
# in contiguous fashion.
|
|
||||||
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
|
|
||||||
|
|
||||||
if "query_key_value" in name:
|
|
||||||
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
|
|
||||||
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
|
|
||||||
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
|
||||||
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
|
||||||
data = torch.cat((q,k,v)).reshape_as(data)
|
|
||||||
|
|
||||||
data = data.squeeze().numpy()
|
|
||||||
|
|
||||||
# map tensor names
|
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
|
||||||
if new_name is None:
|
|
||||||
print("Can not map tensor '" + name + "'")
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
data_dtype = data.dtype
|
|
||||||
|
|
||||||
# if f32 desired, convert any float16 to float32
|
|
||||||
if ftype == 0 and data_dtype == np.float16:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
|
||||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
|
||||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
|
||||||
data = data.astype(np.float16)
|
|
||||||
|
|
||||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
|
||||||
|
|
||||||
gguf_writer.add_tensor(new_name, data)
|
|
||||||
|
|
||||||
|
|
||||||
print("gguf: write header")
|
|
||||||
gguf_writer.write_header_to_file()
|
|
||||||
print("gguf: write metadata")
|
|
||||||
gguf_writer.write_kv_data_to_file()
|
|
||||||
if not args.vocab_only:
|
|
||||||
print("gguf: write tensors")
|
|
||||||
gguf_writer.write_tensors_to_file()
|
|
||||||
|
|
||||||
gguf_writer.close()
|
|
||||||
|
|
||||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
|
||||||
print("")
|
|
@ -1,221 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# HF gptneox--> gguf conversion
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from transformers import AutoTokenizer # type: ignore[import]
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
|
|
||||||
def count_model_parts(dir_model: Path) -> int:
|
|
||||||
num_parts = 0
|
|
||||||
for filename in os.listdir(dir_model):
|
|
||||||
if filename.startswith("pytorch_model-"):
|
|
||||||
num_parts += 1
|
|
||||||
|
|
||||||
if num_parts > 0:
|
|
||||||
print("gguf: found " + str(num_parts) + " model parts")
|
|
||||||
return num_parts
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-only", action="store_true",
|
|
||||||
help="extract only the vocab",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--outfile", type=Path,
|
|
||||||
help="path to write to; default: based on input",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"model", type=Path,
|
|
||||||
help="directory containing model file, or model file itself (*.bin)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
|
||||||
help="output format - use 0 for float32, 1 for float16",
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
dir_model = args.model
|
|
||||||
ftype = args.ftype
|
|
||||||
if not dir_model.is_dir():
|
|
||||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# possible tensor data types
|
|
||||||
# ftype == 0 -> float32
|
|
||||||
# ftype == 1 -> float16
|
|
||||||
|
|
||||||
# map from ftype to string
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
|
|
||||||
if args.outfile is not None:
|
|
||||||
fname_out = args.outfile
|
|
||||||
else:
|
|
||||||
# output in the same directory as the model by default
|
|
||||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
|
||||||
|
|
||||||
print("gguf: loading model "+dir_model.name)
|
|
||||||
|
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
||||||
hparams = json.load(f)
|
|
||||||
|
|
||||||
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
|
|
||||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
|
||||||
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
# get number of model parts
|
|
||||||
num_parts = count_model_parts(dir_model)
|
|
||||||
|
|
||||||
ARCH=gguf.MODEL_ARCH.GPTNEOX
|
|
||||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
|
||||||
|
|
||||||
print("gguf: get model metadata")
|
|
||||||
|
|
||||||
block_count = hparams["num_hidden_layers"]
|
|
||||||
|
|
||||||
gguf_writer.add_name(dir_model.name)
|
|
||||||
gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
|
||||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
|
||||||
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
|
|
||||||
gguf_writer.add_head_count(hparams["num_attention_heads"])
|
|
||||||
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
|
||||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
|
|
||||||
|
|
||||||
# TOKENIZATION
|
|
||||||
|
|
||||||
print("gguf: get tokenizer metadata")
|
|
||||||
|
|
||||||
tokens: list[bytearray] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
# gpt2 tokenizer
|
|
||||||
gguf_writer.add_tokenizer_model("gpt2")
|
|
||||||
|
|
||||||
print("gguf: get gpt2 tokenizer vocab")
|
|
||||||
|
|
||||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|
||||||
|
|
||||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
|
||||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
|
||||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
|
||||||
assert max(tokenizer.vocab.values()) < vocab_size
|
|
||||||
|
|
||||||
added_vocab = tokenizer.get_added_vocab()
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
|
||||||
|
|
||||||
for i in range(vocab_size):
|
|
||||||
if i not in reverse_vocab:
|
|
||||||
tokens.append(f"[PAD{i}]")
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
|
||||||
elif reverse_vocab[i] in added_vocab:
|
|
||||||
tokens.append(reverse_vocab[i])
|
|
||||||
if tokenizer.added_tokens_decoder[i].special:
|
|
||||||
toktypes.append(gguf.TokenType.CONTROL)
|
|
||||||
else:
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
|
||||||
else:
|
|
||||||
tokens.append(reverse_vocab[i])
|
|
||||||
toktypes.append(gguf.TokenType.NORMAL)
|
|
||||||
|
|
||||||
gguf_writer.add_token_list(tokens)
|
|
||||||
gguf_writer.add_token_types(toktypes)
|
|
||||||
|
|
||||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
|
||||||
special_vocab.add_to_gguf(gguf_writer)
|
|
||||||
|
|
||||||
# TENSORS
|
|
||||||
|
|
||||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
|
||||||
|
|
||||||
# tensor info
|
|
||||||
print("gguf: get tensor metadata")
|
|
||||||
|
|
||||||
if num_parts == 0:
|
|
||||||
part_names = iter(("pytorch_model.bin",))
|
|
||||||
else:
|
|
||||||
part_names = (
|
|
||||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
for part_name in part_names:
|
|
||||||
if args.vocab_only:
|
|
||||||
break
|
|
||||||
print("gguf: loading model part '" + part_name + "'")
|
|
||||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
|
||||||
|
|
||||||
for name in model_part.keys():
|
|
||||||
data = model_part[name]
|
|
||||||
|
|
||||||
# we don't need these
|
|
||||||
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
old_dtype = data.dtype
|
|
||||||
|
|
||||||
# convert any unsupported data types to float32
|
|
||||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
|
||||||
data = data.to(torch.float32)
|
|
||||||
|
|
||||||
data = data.squeeze().numpy()
|
|
||||||
|
|
||||||
# map tensor names
|
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
|
||||||
if new_name is None:
|
|
||||||
print("Can not map tensor '" + name + "'")
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
data_dtype = data.dtype
|
|
||||||
|
|
||||||
# if f32 desired, convert any float16 to float32
|
|
||||||
if ftype == 0 and data_dtype == np.float16:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
|
||||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
|
||||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
|
||||||
data = data.astype(np.float16)
|
|
||||||
|
|
||||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
|
||||||
|
|
||||||
gguf_writer.add_tensor(new_name, data)
|
|
||||||
|
|
||||||
|
|
||||||
print("gguf: write header")
|
|
||||||
gguf_writer.write_header_to_file()
|
|
||||||
print("gguf: write metadata")
|
|
||||||
gguf_writer.write_kv_data_to_file()
|
|
||||||
if not args.vocab_only:
|
|
||||||
print("gguf: write tensors")
|
|
||||||
gguf_writer.write_tensors_to_file()
|
|
||||||
|
|
||||||
gguf_writer.close()
|
|
||||||
|
|
||||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
|
||||||
print("")
|
|
900
convert-hf-to-gguf.py
Executable file
900
convert-hf-to-gguf.py
Executable file
@ -0,0 +1,900 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import contextlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from enum import IntEnum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from torch import Tensor
|
||||||
|
|
||||||
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
|
###### MODEL DEFINITIONS ######
|
||||||
|
|
||||||
|
class SentencePieceTokenTypes(IntEnum):
|
||||||
|
NORMAL = 1
|
||||||
|
UNKNOWN = 2
|
||||||
|
CONTROL = 3
|
||||||
|
USER_DEFINED = 4
|
||||||
|
UNUSED = 5
|
||||||
|
BYTE = 6
|
||||||
|
|
||||||
|
|
||||||
|
class Model:
|
||||||
|
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
|
||||||
|
self.dir_model = dir_model
|
||||||
|
self.ftype = ftype
|
||||||
|
self.fname_out = fname_out
|
||||||
|
self.is_big_endian = is_big_endian
|
||||||
|
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
||||||
|
self.is_safetensors = self._is_model_safetensors()
|
||||||
|
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
|
||||||
|
self.part_names = self._get_part_names()
|
||||||
|
self.hparams = Model.load_hparams(self.dir_model)
|
||||||
|
self.model_arch = self._get_model_architecture()
|
||||||
|
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
self._set_vocab_gpt2()
|
||||||
|
|
||||||
|
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||||
|
for part_name in self.part_names:
|
||||||
|
print(f"gguf: loading model part '{part_name}'")
|
||||||
|
ctx: ContextManager[Any]
|
||||||
|
if self.is_safetensors:
|
||||||
|
from safetensors import safe_open
|
||||||
|
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
||||||
|
else:
|
||||||
|
ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu"))
|
||||||
|
|
||||||
|
with ctx as model_part:
|
||||||
|
for name in model_part.keys():
|
||||||
|
data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
|
||||||
|
yield name, data
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
self.gguf_writer.add_name(self.dir_model.name)
|
||||||
|
self.gguf_writer.add_block_count(self.hparams.get(
|
||||||
|
"n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
|
||||||
|
))
|
||||||
|
if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
|
||||||
|
self.gguf_writer.add_context_length(n_ctx)
|
||||||
|
if (n_embd := self.hparams.get("hidden_size")) is not None:
|
||||||
|
self.gguf_writer.add_embedding_length(n_embd)
|
||||||
|
if (n_ff := self.hparams.get("intermediate_size")) is not None:
|
||||||
|
self.gguf_writer.add_feed_forward_length(n_ff)
|
||||||
|
if (n_head := self.hparams.get("num_attention_head")) is not None:
|
||||||
|
self.gguf_writer.add_head_count(n_head)
|
||||||
|
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
for name, data_torch in self.get_tensors():
|
||||||
|
# we don't need these
|
||||||
|
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
|
||||||
|
continue
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
def write(self):
|
||||||
|
self.write_tensors()
|
||||||
|
self.gguf_writer.write_header_to_file()
|
||||||
|
self.gguf_writer.write_kv_data_to_file()
|
||||||
|
self.gguf_writer.write_tensors_to_file()
|
||||||
|
self.gguf_writer.close()
|
||||||
|
|
||||||
|
def write_vocab(self):
|
||||||
|
self.gguf_writer.write_header_to_file()
|
||||||
|
self.gguf_writer.write_kv_data_to_file()
|
||||||
|
self.gguf_writer.close()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def count_model_parts(dir_model: Path, prefix: str) -> int:
|
||||||
|
num_parts = 0
|
||||||
|
for filename in os.listdir(dir_model):
|
||||||
|
if filename.endswith(prefix):
|
||||||
|
num_parts += 1
|
||||||
|
|
||||||
|
return num_parts
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def load_hparams(dir_model):
|
||||||
|
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||||
|
return json.load(f)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def from_model_architecture(model_architecture):
|
||||||
|
if model_architecture == "GPTNeoXForCausalLM":
|
||||||
|
return GPTNeoXModel
|
||||||
|
if model_architecture == "BloomForCausalLM":
|
||||||
|
return BloomModel
|
||||||
|
if model_architecture == "MPTForCausalLM":
|
||||||
|
return MPTModel
|
||||||
|
if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
|
||||||
|
return BaichuanModel
|
||||||
|
if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
|
||||||
|
return FalconModel
|
||||||
|
if model_architecture == "GPTBigCodeForCausalLM":
|
||||||
|
return StarCoderModel
|
||||||
|
if model_architecture == "GPTRefactForCausalLM":
|
||||||
|
return RefactModel
|
||||||
|
if model_architecture == "PersimmonForCausalLM":
|
||||||
|
return PersimmonModel
|
||||||
|
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||||
|
return StableLMModel
|
||||||
|
return Model
|
||||||
|
|
||||||
|
def _is_model_safetensors(self) -> bool:
|
||||||
|
return Model.count_model_parts(self.dir_model, ".safetensors") > 0
|
||||||
|
|
||||||
|
def _get_part_names(self):
|
||||||
|
if self.is_safetensors:
|
||||||
|
if self.num_parts == 1: # there's only one .safetensors file
|
||||||
|
return ("model.safetensors",)
|
||||||
|
return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
|
||||||
|
|
||||||
|
if self.num_parts == 1: # there's only one .bin file
|
||||||
|
return ("pytorch_model.bin",)
|
||||||
|
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
|
||||||
|
|
||||||
|
def _get_model_architecture(self) -> gguf.MODEL_ARCH:
|
||||||
|
arch = self.hparams["architectures"][0]
|
||||||
|
if arch == "GPTNeoXForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.GPTNEOX
|
||||||
|
if arch == "BloomForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.BLOOM
|
||||||
|
if arch == "MPTForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.MPT
|
||||||
|
if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
|
||||||
|
return gguf.MODEL_ARCH.BAICHUAN
|
||||||
|
if arch in ("FalconForCausalLM", "RWForCausalLM"):
|
||||||
|
return gguf.MODEL_ARCH.FALCON
|
||||||
|
if arch == "GPTBigCodeForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.STARCODER
|
||||||
|
if arch == "GPTRefactForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.REFACT
|
||||||
|
if arch == "PersimmonForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.PERSIMMON
|
||||||
|
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||||
|
return gguf.MODEL_ARCH.STABLELM
|
||||||
|
|
||||||
|
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||||
|
|
||||||
|
def _set_vocab_gpt2(self):
|
||||||
|
dir_model = self.dir_model
|
||||||
|
hparams = self.hparams
|
||||||
|
tokens: list[bytearray] = []
|
||||||
|
toktypes: list[int] = []
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer # type: ignore[attr-defined]
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||||
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||||
|
assert max(tokenizer.vocab.values()) < vocab_size
|
||||||
|
|
||||||
|
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
||||||
|
added_vocab = tokenizer.get_added_vocab()
|
||||||
|
|
||||||
|
for i in range(vocab_size):
|
||||||
|
if i not in reverse_vocab:
|
||||||
|
pad_token = f"[PAD{i}]".encode('utf-8')
|
||||||
|
tokens.append(bytearray(pad_token))
|
||||||
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
|
elif reverse_vocab[i] in added_vocab:
|
||||||
|
tokens.append(reverse_vocab[i])
|
||||||
|
if tokenizer.added_tokens_decoder[i].special:
|
||||||
|
toktypes.append(gguf.TokenType.CONTROL)
|
||||||
|
else:
|
||||||
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
|
else:
|
||||||
|
tokens.append(reverse_vocab[i])
|
||||||
|
toktypes.append(gguf.TokenType.NORMAL)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
def _set_vocab_sentencepiece(self):
|
||||||
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
|
||||||
|
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||||
|
|
||||||
|
tokens: list[bytes] = []
|
||||||
|
scores: list[float] = []
|
||||||
|
toktypes: list[int] = []
|
||||||
|
|
||||||
|
if not tokenizer_path.is_file():
|
||||||
|
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||||
|
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||||
|
|
||||||
|
for token_id in range(vocab_size):
|
||||||
|
piece = tokenizer.id_to_piece(token_id)
|
||||||
|
text = piece.encode("utf-8")
|
||||||
|
score = tokenizer.get_score(token_id)
|
||||||
|
|
||||||
|
toktype = SentencePieceTokenTypes.NORMAL
|
||||||
|
if tokenizer.is_unknown(token_id):
|
||||||
|
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||||
|
elif tokenizer.is_control(token_id):
|
||||||
|
toktype = SentencePieceTokenTypes.CONTROL
|
||||||
|
elif tokenizer.is_unused(token_id):
|
||||||
|
toktype = SentencePieceTokenTypes.UNUSED
|
||||||
|
elif tokenizer.is_byte(token_id):
|
||||||
|
toktype = SentencePieceTokenTypes.BYTE
|
||||||
|
|
||||||
|
tokens.append(text)
|
||||||
|
scores.append(score)
|
||||||
|
toktypes.append(toktype)
|
||||||
|
|
||||||
|
added_tokens_file = self.dir_model / 'added_tokens.json'
|
||||||
|
if added_tokens_file.is_file():
|
||||||
|
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||||
|
added_tokens_json = json.load(f)
|
||||||
|
|
||||||
|
for key in added_tokens_json:
|
||||||
|
tokens.append(key.encode("utf-8"))
|
||||||
|
scores.append(-1000.0)
|
||||||
|
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tokenizer_model("llama")
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
|
||||||
|
class GPTNeoXModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])


class BloomModel(Model):
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
self.gguf_writer.add_name("Bloom")
|
||||||
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
|
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
||||||
|
self.gguf_writer.add_embedding_length(n_embed)
|
||||||
|
self.gguf_writer.add_feed_forward_length(4 * n_embed)
|
||||||
|
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||||
|
self.gguf_writer.add_head_count(n_head)
|
||||||
|
self.gguf_writer.add_head_count_kv(n_head)
|
||||||
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams["n_layer"]
|
||||||
|
tensors = dict(self.get_tensors())
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
has_lm_head = True
|
||||||
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
|
|
||||||
|
for name, data_torch in tensors.items():
|
||||||
|
if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
|
||||||
|
has_lm_head = False
|
||||||
|
|
||||||
|
name = re.sub(r'transformer\.', '', name)
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
|
||||||
|
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||||
|
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||||
|
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
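# i.e. the per-head interleaved Q/K/V blocks are regrouped into contiguous
# Q rows, then K rows, then V rows (the reshape + concatenate below)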
|
||||||
|
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||||
|
data = np.concatenate(
|
||||||
|
(
|
||||||
|
qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
|
||||||
|
),
|
||||||
|
axis=0,
|
||||||
|
)
|
||||||
|
print("re-format attention.linear_qkv.weight")
|
||||||
|
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
|
||||||
|
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
|
||||||
|
data = np.concatenate(
|
||||||
|
(
|
||||||
|
qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 2, :].reshape((n_embed,)),
|
||||||
|
),
|
||||||
|
axis=0,
|
||||||
|
)
|
||||||
|
print("re-format attention.linear_qkv.bias")
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
if not has_lm_head and name == "word_embeddings.weight":
|
||||||
|
self.gguf_writer.add_tensor("output.weight", data)
|
||||||
|
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
|
||||||
|
class MPTModel(Model):
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
block_count = self.hparams["n_layers"]
|
||||||
|
self.gguf_writer.add_name(self.dir_model.name)
|
||||||
|
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
||||||
|
self.gguf_writer.add_block_count(block_count)
|
||||||
|
self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
|
||||||
|
self.gguf_writer.add_head_count(self.hparams["n_heads"])
|
||||||
|
if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
|
||||||
|
self.gguf_writer.add_head_count_kv(kv_n_heads)
|
||||||
|
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||||
|
if self.hparams["attn_config"]["clip_qkv"] is not None:
|
||||||
|
self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
|
||||||
|
self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
for name, data_torch in self.get_tensors():
|
||||||
|
# we don't need these
|
||||||
|
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
|
||||||
|
continue
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
# note: MPT output is tied to (same as) wte in original model;
|
||||||
|
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
|
||||||
|
if new_name == "token_embd.weight":
|
||||||
|
self.gguf_writer.add_tensor("output.weight", data)
|
||||||
|
|
||||||
|
|
||||||
|
class BaichuanModel(Model):
|
||||||
|
def set_vocab(self):
|
||||||
|
self._set_vocab_sentencepiece()
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
block_count = self.hparams["num_hidden_layers"]
|
||||||
|
head_count = self.hparams["num_attention_heads"]
|
||||||
|
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
||||||
|
hf_repo = self.hparams.get("_name_or_path", "")
|
||||||
|
|
||||||
|
ctx_length = 0
|
||||||
|
if "max_sequence_length" in self.hparams:
|
||||||
|
ctx_length = self.hparams["max_sequence_length"]
|
||||||
|
elif "max_position_embeddings" in self.hparams:
|
||||||
|
ctx_length = self.hparams["max_position_embeddings"]
|
||||||
|
elif "model_max_length" in self.hparams:
|
||||||
|
ctx_length = self.hparams["model_max_length"]
|
||||||
|
else:
|
||||||
|
print("gguf: can not find ctx length parameter.")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
self.gguf_writer.add_name(self.dir_model.name)
|
||||||
|
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||||
|
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||||
|
self.gguf_writer.add_context_length(ctx_length)
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||||
|
self.gguf_writer.add_block_count(block_count)
|
||||||
|
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||||
|
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_head_count(head_count)
|
||||||
|
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||||
|
|
||||||
|
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||||
|
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||||
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
|
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
# Collect tensors from generator object
|
||||||
|
model_kv = dict(self.get_tensors())
|
||||||
|
block_count = self.hparams["num_hidden_layers"]
|
||||||
|
head_count = self.hparams["num_attention_heads"]
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
||||||
|
|
||||||
|
for i in range(block_count):
|
||||||
|
if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
|
||||||
|
print(f"Unpacking and permuting layer {i}")
|
||||||
|
model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
|
||||||
|
self._reverse_hf_permute_part(w, 0, head_count, head_count)
|
||||||
|
model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
|
||||||
|
self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
|
||||||
|
model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
|
||||||
|
self._reverse_hf_part(w, 2)
|
||||||
|
del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
|
||||||
|
|
||||||
|
for name, data_torch in model_kv.items():
|
||||||
|
# we don't need these
|
||||||
|
if name.endswith(".rotary_emb.inv_freq"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||||
|
if n_kv_head is not None and n_head != n_kv_head:
|
||||||
|
n_head //= n_kv_head
|
||||||
|
|
||||||
|
return (
|
||||||
|
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||||
|
.swapaxes(1, 2)
|
||||||
|
.reshape(weights.shape)
|
||||||
|
)
|
||||||
|
|
||||||
|
def _reverse_hf_permute_part(
|
||||||
|
self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
|
||||||
|
) -> Tensor:
|
||||||
|
r = weights.shape[0] // 3
|
||||||
|
return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
|
||||||
|
|
||||||
|
def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
|
||||||
|
r = weights.shape[0] // 3
|
||||||
|
return weights[r * n_part:r * n_part + r, ...]
|
||||||
|
|
||||||
|
|
||||||
|
class FalconModel(Model):
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
block_count = self.hparams.get("num_hidden_layers")
|
||||||
|
if block_count is None:
|
||||||
|
block_count = self.hparams["n_layer"] # old name
|
||||||
|
|
||||||
|
n_head = self.hparams.get("num_attention_heads")
|
||||||
|
if n_head is None:
|
||||||
|
n_head = self.hparams["n_head"] # old name
|
||||||
|
|
||||||
|
n_head_kv = self.hparams.get("num_kv_heads")
|
||||||
|
if n_head_kv is None:
|
||||||
|
n_head_kv = self.hparams.get("n_head_kv", 1) # old name
|
||||||
|
|
||||||
|
self.gguf_writer.add_name("Falcon")
|
||||||
|
self.gguf_writer.add_context_length(2048) # not in config.json
|
||||||
|
self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||||
|
self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
|
||||||
|
self.gguf_writer.add_block_count(block_count)
|
||||||
|
self.gguf_writer.add_head_count(n_head)
|
||||||
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||||
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams.get("num_hidden_layers")
|
||||||
|
if block_count is None:
|
||||||
|
block_count = self.hparams["n_layer"] # old name
|
||||||
|
|
||||||
|
n_head = self.hparams.get("num_attention_heads")
|
||||||
|
if n_head is None:
|
||||||
|
n_head = self.hparams["n_head"] # old name
|
||||||
|
|
||||||
|
n_head_kv = self.hparams.get("num_kv_heads")
|
||||||
|
if n_head_kv is None:
|
||||||
|
n_head_kv = self.hparams.get("n_head_kv", 1) # old name
|
||||||
|
|
||||||
|
head_dim = self.hparams["hidden_size"] // n_head
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
|
||||||
|
for name, data_torch in self.get_tensors():
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
# QKV tensor transform
|
||||||
|
# The original query_key_value tensor contains n_head_kv "kv groups",
|
||||||
|
# each consisting of n_head/n_head_kv query weights followed by one key
|
||||||
|
# and one value weight (shared by all query heads in the kv group).
|
||||||
|
# This layout makes it a big pain to work with in GGML.
|
||||||
|
# So we rearrange them here, so that we have n_head query weights
|
||||||
|
# followed by n_head_kv key weights followed by n_head_kv value weights,
|
||||||
|
# in contiguous fashion.
|
||||||
|
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
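# e.g. with Falcon-7B-style dims (n_head = 71, n_head_kv = 1, head_dim = 64, hidden_size = 4544),
# the view below is (1, 73, 64, 4544): 71 query rows plus one key and one value row per kv group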
|
||||||
|
|
||||||
|
if "query_key_value" in name:
|
||||||
|
qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
|
||||||
|
q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
|
||||||
|
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||||
|
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||||
|
data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)


class StarCoderModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("StarCoder")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)


class RefactModel(Model):
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
hidden_dim = self.hparams["n_embd"]
|
||||||
|
inner_dim = 4 * hidden_dim
|
||||||
|
hidden_dim = int(2 * inner_dim / 3)
|
||||||
|
multiple_of = 256
|
||||||
|
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
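# LLaMA-style FFN sizing: 2/3 of 4*n_embd, rounded up to a multiple of 256
# e.g. n_embd = 4096 -> inner_dim = 16384, hidden_dim = 10922, ff_dim = 11008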
|
||||||
|
|
||||||
|
block_count = self.hparams["n_layer"]
|
||||||
|
|
||||||
|
self.gguf_writer.add_name("Refact")
|
||||||
|
# refact uses Alibi. So this is from config.json which might be used by training.
|
||||||
|
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||||
|
|
||||||
|
self.gguf_writer.add_feed_forward_length(ff_dim)
|
||||||
|
self.gguf_writer.add_block_count(block_count)
|
||||||
|
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||||
|
self.gguf_writer.add_head_count_kv(1)
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
hidden_dim = self.hparams["n_embd"]
|
||||||
|
inner_dim = 4 * hidden_dim
|
||||||
|
hidden_dim = int(2 * inner_dim / 3)
|
||||||
|
multiple_of = 256
|
||||||
|
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
||||||
|
n_head = self.hparams["n_head"]
|
||||||
|
n_head_kv = 1
|
||||||
|
head_dim = self.hparams["n_embd"] // n_head
|
||||||
|
block_count = self.hparams["n_layer"]
|
||||||
|
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
|
||||||
|
tensors = dict(self.get_tensors())
|
||||||
|
for i in range(block_count):
|
||||||
|
if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
|
||||||
|
tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
|
||||||
|
tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
|
||||||
|
del tensors[f"transformer.h.{i}.attn.kv.weight"]
|
||||||
|
if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
|
||||||
|
tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
|
||||||
|
del tensors[f"transformer.h.{i}.attn.q.weight"]
|
||||||
|
if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
|
||||||
|
tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
|
||||||
|
tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
|
||||||
|
del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
|
||||||
|
|
||||||
|
for name, data_torch in tensors.items():
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)


class PersimmonModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = head_count
        hidden_size = self.hparams["hidden_size"]

        self.gguf_writer.add_name('persimmon-8b-chat')
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
        # self.gguf_writer.add_bos_token_id(71013)
        # self.gguf_writer.add_eos_token_id(71013)

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class StableLMModel(Model):
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)


###### CONVERSION LOGIC ######


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16"], default="f16",
        help="output format - use f32 for float32, f16 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
    )

    return parser.parse_args()


args = parse_args()

dir_model = args.model
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file=sys.stderr)
    sys.exit(1)

ftype_map = {
    "f32": gguf.GGMLQuantizationType.F32,
    "f16": gguf.GGMLQuantizationType.F16,
}

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

print(f"Loading model: {dir_model.name}")

hparams = Model.load_hparams(dir_model)

with torch.inference_mode():
    model_class = Model.from_model_architecture(hparams["architectures"][0])
    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

    print("Set model parameters")
    model_instance.set_gguf_parameters()

    print("Set model tokenizer")
    model_instance.set_vocab()

    if args.vocab_only:
        print(f"Exporting model vocab to '{fname_out}'")
        model_instance.write_vocab()
    else:
        print(f"Exporting model to '{fname_out}'")
        model_instance.write()

    print(f"Model successfully exported to '{fname_out}'")

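# Example invocation (a sketch; assumes the unified script above is saved as convert-hf-to-gguf.py
# and that the model directory contains config.json plus the usual HF tokenizer and weight files):
#   python convert-hf-to-gguf.py --outtype f16 ./path/to/hf-model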
@ -2,7 +2,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import math
|
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
from enum import IntEnum
|
from enum import IntEnum
|
||||||
@ -12,34 +11,16 @@ import numpy as np
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
# Note: Does not support GGML_QKK_64
|
|
||||||
QK_K = 256
|
|
||||||
# Items here are (block size, type size)
|
|
||||||
GGML_QUANT_SIZES = {
|
|
||||||
gguf.GGMLQuantizationType.F32 : (1, 4),
|
|
||||||
gguf.GGMLQuantizationType.F16 : (1, 2),
|
|
||||||
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
|
|
||||||
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
|
|
||||||
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
|
|
||||||
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
|
|
||||||
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
|
|
||||||
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
|
|
||||||
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
|
|
||||||
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
|
|
||||||
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
|
|
||||||
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
|
|
||||||
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
|
|
||||||
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
|
|
||||||
}
|
|
||||||
|
|
||||||
class GGMLFormat(IntEnum):
|
class GGMLFormat(IntEnum):
|
||||||
GGML = 0
|
GGML = 0
|
||||||
GGMF = 1
|
GGMF = 1
|
||||||
GGJT = 2
|
GGJT = 2
|
||||||
|
|
||||||
|
|
||||||
class GGMLFType(IntEnum):
|
class GGMLFType(IntEnum):
|
||||||
ALL_F32 = 0
|
ALL_F32 = 0
|
||||||
MOSTLY_F16 = 1
|
MOSTLY_F16 = 1
|
||||||
@ -59,6 +40,7 @@ class GGMLFType(IntEnum):
|
|||||||
MOSTLY_Q5_K_M = 17
|
MOSTLY_Q5_K_M = 17
|
||||||
MOSTLY_Q6_K = 18
|
MOSTLY_Q6_K = 18
|
||||||
|
|
||||||
|
|
||||||
class Hyperparameters:
|
class Hyperparameters:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
|
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
|
||||||
@ -90,6 +72,7 @@ class Hyperparameters:
|
|||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
|
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
|
||||||
|
|
||||||
|
|
||||||
class Vocab:
|
class Vocab:
|
||||||
def __init__(self, load_scores = True):
|
def __init__(self, load_scores = True):
|
||||||
self.items = []
|
self.items = []
|
||||||
@ -111,6 +94,7 @@ class Vocab:
|
|||||||
self.items.append((item_text, item_score))
|
self.items.append((item_text, item_score))
|
||||||
return offset - orig_offset
|
return offset - orig_offset
|
||||||
|
|
||||||
|
|
||||||
class Tensor:
|
class Tensor:
|
||||||
def __init__(self, use_padding = True):
|
def __init__(self, use_padding = True):
|
||||||
self.name = None
|
self.name = None
|
||||||
@ -125,7 +109,7 @@ class Tensor:
|
|||||||
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
|
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
|
||||||
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
|
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
|
||||||
assert name_len < 4096, 'Absurd tensor name length'
|
assert name_len < 4096, 'Absurd tensor name length'
|
||||||
quant = GGML_QUANT_SIZES.get(dtype)
|
quant = gguf.GGML_QUANT_SIZES.get(dtype)
|
||||||
assert quant is not None, 'Unknown tensor type'
|
assert quant is not None, 'Unknown tensor type'
|
||||||
(blksize, tysize) = quant
|
(blksize, tysize) = quant
|
||||||
offset += 12
|
offset += 12
|
||||||
@ -144,6 +128,7 @@ class Tensor:
|
|||||||
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
|
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
|
||||||
return offset - orig_offset
|
return offset - orig_offset
|
||||||
|
|
||||||
|
|
||||||
class GGMLModel:
|
class GGMLModel:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.hyperparameters = None
|
self.hyperparameters = None
|
||||||
@ -208,6 +193,7 @@ class GGMLModel:
|
|||||||
hp.set_n_ff(self)
|
hp.set_n_ff(self)
|
||||||
return offset
|
return offset
|
||||||
|
|
||||||
|
|
||||||
class GGMLToGGUF:
|
class GGMLToGGUF:
|
||||||
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
|
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
|
||||||
hp = ggml_model.hyperparameters
|
hp = ggml_model.hyperparameters
|
||||||
@ -364,6 +350,7 @@ class GGMLToGGUF:
|
|||||||
raw_shape = tempdims,
|
raw_shape = tempdims,
|
||||||
raw_dtype = tensor.dtype)
|
raw_dtype = tensor.dtype)
|
||||||
|
|
||||||
|
|
||||||
def handle_metadata(cfg, hp):
|
def handle_metadata(cfg, hp):
|
||||||
import convert
|
import convert
|
||||||
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
|
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
|
||||||
@ -394,6 +381,7 @@ def handle_metadata(cfg, hp):
|
|||||||
convert.check_vocab_size(params, vocab)
|
convert.check_vocab_size(params, vocab)
|
||||||
return (params, vocab, svocab)
|
return (params, vocab, svocab)
|
||||||
|
|
||||||
|
|
||||||
def handle_args():
|
def handle_args():
|
||||||
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
|
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
|
||||||
parser.add_argument('--input', '-i', type = Path, required = True,
|
parser.add_argument('--input', '-i', type = Path, required = True,
|
||||||
@ -418,6 +406,7 @@ def handle_args():
|
|||||||
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
cfg = handle_args()
|
cfg = handle_args()
|
||||||
print(f'* Using config: {cfg}')
|
print(f'* Using config: {cfg}')
|
||||||
@ -427,7 +416,7 @@ def main():
|
|||||||
data = np.memmap(cfg.input, mode = 'r')
|
data = np.memmap(cfg.input, mode = 'r')
|
||||||
model = GGMLModel()
|
model = GGMLModel()
|
||||||
print('* Scanning GGML input file')
|
print('* Scanning GGML input file')
|
||||||
offset = model.load(data, 0)
|
offset = model.load(data, 0) # noqa
|
||||||
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
||||||
vocab_override = None
|
vocab_override = None
|
||||||
params_override = None
|
params_override = None
|
||||||
@ -442,12 +431,15 @@ def main():
|
|||||||
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
||||||
if model.file_format == GGMLFormat.GGML:
|
if model.file_format == GGMLFormat.GGML:
|
||||||
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
||||||
converter = GGMLToGGUF(model, data, cfg,
|
converter = GGMLToGGUF(
|
||||||
|
model, data, cfg,
|
||||||
params_override = params_override,
|
params_override = params_override,
|
||||||
vocab_override = vocab_override,
|
vocab_override = vocab_override,
|
||||||
special_vocab = special_vocab )
|
special_vocab = special_vocab
|
||||||
|
)
|
||||||
converter.save()
|
converter.save()
|
||||||
print(f'* Successful completion. Output saved to: {cfg.output}')
|
print(f'* Successful completion. Output saved to: {cfg.output}')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
@ -1,227 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# HF mpt--> gguf conversion
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from transformers import AutoTokenizer # type: ignore[import]
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
|
|
||||||
def count_model_parts(dir_model: Path) -> int:
|
|
||||||
num_parts = 0
|
|
||||||
for filename in os.listdir(dir_model):
|
|
||||||
if filename.startswith("pytorch_model-"):
|
|
||||||
num_parts += 1
|
|
||||||
|
|
||||||
if num_parts > 0:
|
|
||||||
print("gguf: found " + str(num_parts) + " model parts")
|
|
||||||
return num_parts
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-only", action="store_true",
|
|
||||||
help="extract only the vocab",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--outfile", type=Path,
|
|
||||||
help="path to write to; default: based on input",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"model", type=Path,
|
|
||||||
help="directory containing model file, or model file itself (*.bin)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
|
||||||
help="output format - use 0 for float32, 1 for float16",
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
dir_model = args.model
|
|
||||||
ftype = args.ftype
|
|
||||||
if not dir_model.is_dir():
|
|
||||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# possible tensor data types
|
|
||||||
# ftype == 0 -> float32
|
|
||||||
# ftype == 1 -> float16
|
|
||||||
|
|
||||||
# map from ftype to string
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
|
|
||||||
if args.outfile is not None:
|
|
||||||
fname_out = args.outfile
|
|
||||||
else:
|
|
||||||
# output in the same directory as the model by default
|
|
||||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
|
||||||
|
|
||||||
print("gguf: loading model "+dir_model.name)
|
|
||||||
|
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
||||||
hparams = json.load(f)
|
|
||||||
|
|
||||||
if hparams["architectures"][0] != "MPTForCausalLM":
|
|
||||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
|
||||||
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
# get number of model parts
|
|
||||||
num_parts = count_model_parts(dir_model)
|
|
||||||
|
|
||||||
ARCH=gguf.MODEL_ARCH.MPT
|
|
||||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
|
||||||
|
|
||||||
print("gguf: get model metadata")
|
|
||||||
|
|
||||||
block_count = hparams["n_layers"]
|
|
||||||
|
|
||||||
gguf_writer.add_name(dir_model.name)
|
|
||||||
gguf_writer.add_context_length(hparams["max_seq_len"])
|
|
||||||
gguf_writer.add_embedding_length(hparams["d_model"])
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
|
|
||||||
gguf_writer.add_head_count(hparams["n_heads"])
|
|
||||||
if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
|
|
||||||
gguf_writer.add_head_count_kv(kv_n_heads)
|
|
||||||
gguf_writer.add_layer_norm_eps(1e-05)
|
|
||||||
if hparams["attn_config"]["clip_qkv"] is not None:
|
|
||||||
gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
|
|
||||||
gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
|
|
||||||
|
|
||||||
# TOKENIZATION
|
|
||||||
|
|
||||||
print("gguf: get tokenizer metadata")
|
|
||||||
|
|
||||||
tokens: list[bytearray] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
# gpt2 tokenizer
|
|
||||||
gguf_writer.add_tokenizer_model("gpt2")
|
|
||||||
|
|
||||||
print("gguf: get gpt2 tokenizer vocab")
|
|
||||||
|
|
||||||
# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
|
|
||||||
# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
|
|
||||||
# accommodate some "reserved" tokens; this is causing problems down the line in
|
|
||||||
# llama.cpp, so we pad the vocab with dummy tokens:
|
|
||||||
|
|
||||||
vocab_size = hparams["vocab_size"]
|
|
||||||
|
|
||||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|
||||||
|
|
||||||
added_vocab = tokenizer.get_added_vocab()
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
|
||||||
|
|
||||||
for i in range(vocab_size):
|
|
||||||
if i not in reverse_vocab:
|
|
||||||
tokens.append(f"[PAD{i}]")
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
|
||||||
elif reverse_vocab[i] in added_vocab:
|
|
||||||
tokens.append(reverse_vocab[i])
|
|
||||||
if tokenizer.added_tokens_decoder[i].special:
|
|
||||||
toktypes.append(gguf.TokenType.CONTROL)
|
|
||||||
else:
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
|
||||||
else:
|
|
||||||
tokens.append(reverse_vocab[i])
|
|
||||||
toktypes.append(gguf.TokenType.NORMAL)
|
|
||||||
|
|
||||||
gguf_writer.add_token_list(tokens)
|
|
||||||
gguf_writer.add_token_types(toktypes)
|
|
||||||
|
|
||||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
|
||||||
special_vocab.add_to_gguf(gguf_writer)
|
|
||||||
|
|
||||||
# TENSORS
|
|
||||||
|
|
||||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
|
||||||
|
|
||||||
# tensor info
|
|
||||||
print("gguf: get tensor metadata")
|
|
||||||
|
|
||||||
if num_parts == 0:
|
|
||||||
part_names = iter(("pytorch_model.bin",))
|
|
||||||
else:
|
|
||||||
part_names = (
|
|
||||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
for part_name in part_names:
|
|
||||||
if args.vocab_only:
|
|
||||||
break
|
|
||||||
print("gguf: loading model part '" + part_name + "'")
|
|
||||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
|
||||||
|
|
||||||
for name in model_part.keys():
|
|
||||||
data = model_part[name]
|
|
||||||
|
|
||||||
old_dtype = data.dtype
|
|
||||||
|
|
||||||
# convert any unsupported data types to float32
|
|
||||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
|
||||||
data = data.to(torch.float32)
|
|
||||||
|
|
||||||
data = data.squeeze().numpy()
|
|
||||||
|
|
||||||
# map tensor names
|
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
|
||||||
if new_name is None:
|
|
||||||
print("Cannot map tensor '" + name + "'")
|
|
||||||
continue # for the sake of compatibility with some old published models, don't quit
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
data_dtype = data.dtype
|
|
||||||
|
|
||||||
# if f32 desired, convert any float16 to float32
|
|
||||||
if ftype == 0 and data_dtype == np.float16:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
|
||||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
|
||||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
|
||||||
data = data.astype(np.float16)
|
|
||||||
|
|
||||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
|
||||||
|
|
||||||
gguf_writer.add_tensor(new_name, data)
|
|
||||||
|
|
||||||
# note: MPT output is tied to (same as) wte in original model;
|
|
||||||
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
|
|
||||||
if new_name == "token_embd.weight":
|
|
||||||
gguf_writer.add_tensor("output.weight", data)
|
|
||||||
|
|
||||||
print("gguf: write header")
|
|
||||||
gguf_writer.write_header_to_file()
|
|
||||||
print("gguf: write metadata")
|
|
||||||
gguf_writer.write_kv_data_to_file()
|
|
||||||
if not args.vocab_only:
|
|
||||||
print("gguf: write tensors")
|
|
||||||
gguf_writer.write_tensors_to_file()
|
|
||||||
|
|
||||||
gguf_writer.close()
|
|
||||||
|
|
||||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
|
||||||
print("")
|
|
@ -6,9 +6,10 @@ import argparse
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from sentencepiece import SentencePieceProcessor
|
from sentencepiece import SentencePieceProcessor
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
def _flatten_dict(dct, tensors, prefix=None):
|
def _flatten_dict(dct, tensors, prefix=None):
|
||||||
assert isinstance(dct, dict)
|
assert isinstance(dct, dict)
|
||||||
for key in dct.keys():
|
for key in dct.keys():
|
||||||
@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None):
|
|||||||
raise ValueError(type(dct[key]))
|
raise ValueError(type(dct[key]))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
||||||
tokenizer_path = dir_model / 'adept_vocab.model'
|
tokenizer_path = dir_model / 'adept_vocab.model'
|
||||||
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
|
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
|
||||||
@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
|||||||
pass
|
pass
|
||||||
return tokens, scores, toktypes
|
return tokens, scores, toktypes
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
|
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
|
||||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||||
@ -125,6 +128,5 @@ def main():
|
|||||||
print("")
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
@ -1,272 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# HF refact--> gguf conversion
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from transformers import AutoTokenizer # type: ignore[import]
|
|
||||||
|
|
||||||
if "NO_LOCAL_GGUF" not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
def count_model_parts(dir_model: Path) -> int:
|
|
||||||
num_parts = 0
|
|
||||||
for filename in os.listdir(dir_model):
|
|
||||||
if filename.startswith("pytorch_model-"):
|
|
||||||
num_parts += 1
|
|
||||||
|
|
||||||
if num_parts > 0:
|
|
||||||
print("gguf: found " + str(num_parts) + " model parts")
|
|
||||||
return num_parts
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Convert a Refact model to a GGML compatible file"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-only",
|
|
||||||
action="store_true",
|
|
||||||
help="extract only the vocab",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--outfile",
|
|
||||||
type=Path,
|
|
||||||
help="path to write to; default: based on input",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"model",
|
|
||||||
type=Path,
|
|
||||||
help="directory containing model file, or model file itself (*.bin)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"ftype",
|
|
||||||
type=int,
|
|
||||||
choices=[0, 1],
|
|
||||||
default=1,
|
|
||||||
nargs="?",
|
|
||||||
help="output format - use 0 for float32, 1 for float16",
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
dir_model = args.model
|
|
||||||
ftype = args.ftype
|
|
||||||
if not dir_model.is_dir():
|
|
||||||
print(f"Error: {args.model} is not a directory", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# possible tensor data types
|
|
||||||
# ftype == 0 -> float32
|
|
||||||
# ftype == 1 -> float16
|
|
||||||
|
|
||||||
# map from ftype to string
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
|
|
||||||
if args.outfile is not None:
|
|
||||||
fname_out = args.outfile
|
|
||||||
else:
|
|
||||||
# output in the same directory as the model by default
|
|
||||||
fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
|
|
||||||
|
|
||||||
print("gguf: loading model " + dir_model.name)
|
|
||||||
|
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
||||||
hparams = json.load(f)
|
|
||||||
|
|
||||||
if hparams["architectures"][0] != "GPTRefactForCausalLM":
|
|
||||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
|
||||||
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# get number of model parts
|
|
||||||
num_parts = count_model_parts(dir_model)
|
|
||||||
|
|
||||||
ARCH = gguf.MODEL_ARCH.REFACT
|
|
||||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
|
||||||
|
|
||||||
print("gguf: get model metadata")
|
|
||||||
|
|
||||||
# Get refact feed forward dimension
|
|
||||||
hidden_dim = hparams["n_embd"]
|
|
||||||
inner_dim = 4 * hidden_dim
|
|
||||||
hidden_dim = int(2 * inner_dim / 3)
|
|
||||||
multiple_of = 256
|
|
||||||
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
|
||||||
|
|
||||||
block_count = hparams["n_layer"]
|
|
||||||
|
|
||||||
gguf_writer.add_name("Refact")
|
|
||||||
# refact uses Alibi. So this is from config.json which might be used by training.
|
|
||||||
gguf_writer.add_context_length(hparams["n_positions"])
|
|
||||||
gguf_writer.add_embedding_length(hparams["n_embd"])
|
|
||||||
|
|
||||||
gguf_writer.add_feed_forward_length(ff_dim)
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_head_count(hparams["n_head"])
|
|
||||||
gguf_writer.add_head_count_kv(1)
|
|
||||||
gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
|
|
||||||
gguf_writer.add_file_type(ftype)
|
|
||||||
|
|
||||||
# TOKENIZATION
|
|
||||||
|
|
||||||
print("gguf: get tokenizer metadata")
|
|
||||||
|
|
||||||
tokens: list[bytearray] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
# gpt2 tokenizer
|
|
||||||
gguf_writer.add_tokenizer_model("gpt2")
|
|
||||||
|
|
||||||
print("gguf: get gpt2 tokenizer vocab")
|
|
||||||
|
|
||||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|
||||||
|
|
    # The number of tokens in tokenizer.json can differ from the expected vocab size.
    # This causes downstream issues with mismatched tensor sizes when running the inference
    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
    assert max(tokenizer.vocab.values()) < vocab_size

    added_vocab = tokenizer.get_added_vocab()
    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

    for i in range(vocab_size):
        if i not in reverse_vocab:
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.USER_DEFINED)
        elif reverse_vocab[i] in added_vocab:
            tokens.append(reverse_vocab[i])
            if tokenizer.added_tokens_decoder[i].special:
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                toktypes.append(gguf.TokenType.USER_DEFINED)
        else:
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)

    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_types(toktypes)

    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
    special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for i in range(block_count):
        if f"transformer.h.{i}.attn.kv.weight" in model_part:
            data = model_part[f"transformer.h.{i}.attn.kv.weight"]
            model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
                : n_head_kv * head_dim
            ]
            model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
                n_head_kv * head_dim :
            ]
            del model_part[f"transformer.h.{i}.attn.kv.weight"]
        if f"transformer.h.{i}.attn.q.weight" in model_part:
            model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
                f"transformer.h.{i}.attn.q.weight"
            ]
            del model_part[f"transformer.h.{i}.attn.q.weight"]
        if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
            data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
            model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
            model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
            del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if (
            ftype == 1
            and data_dtype == np.float32
            and name.endswith(".weight")
            and n_dims == 2
        ):
            data = data.astype(np.float16)

        print(
            new_name
            + ", n_dims = "
            + str(n_dims)
            + ", "
            + str(old_dtype)
            + " --> "
            + str(data.dtype)
        )

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
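The deleted converter above splits the model's fused `attn.kv.weight` and `mlp.gate_up_proj.weight` tensors into the separate `k_proj`/`v_proj` and `gate_proj`/`up_proj` tensors that the GGUF tensor map expects. Below is a minimal standalone sketch of that slicing; the layer index, shapes and dimensions are illustrative assumptions, not taken from a real checkpoint.

```python
# Sketch of the fused-weight splitting done by the deleted converter above.
# n_embd / n_head / ff_dim here are hypothetical, chosen only for illustration.
import torch

n_embd, n_head, n_head_kv = 64, 8, 1
head_dim = n_embd // n_head
ff_dim = 4 * n_embd

state = {
    "transformer.h.0.attn.kv.weight": torch.randn(2 * n_head_kv * head_dim, n_embd),
    "transformer.h.0.mlp.gate_up_proj.weight": torch.randn(2 * ff_dim, n_embd),
}

# the first n_head_kv * head_dim rows of the fused KV weight are K, the rest are V
kv = state.pop("transformer.h.0.attn.kv.weight")
state["model.layers.0.self_attn.k_proj.weight"] = kv[: n_head_kv * head_dim]
state["model.layers.0.self_attn.v_proj.weight"] = kv[n_head_kv * head_dim :]

# the fused MLP weight is split at ff_dim into gate and up projections
gate_up = state.pop("transformer.h.0.mlp.gate_up_proj.weight")
state["model.layers.0.mlp.gate_proj.weight"] = gate_up[:ff_dim]
state["model.layers.0.mlp.up_proj.weight"] = gate_up[ff_dim:]

for name, tensor in state.items():
    print(name, tuple(tensor.shape))
```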
@@ -1,210 +0,0 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")
        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
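Both deleted converters apply the same dtype policy before a tensor is written: anything that is neither f16 nor f32 is first cast to f32, 1-dimensional tensors stay in f32 even for f16 output, and only 2-dimensional `.weight` tensors are downcast to f16. A NumPy-only sketch of that decision follows; the helper name and the sample tensor names are assumptions made for illustration.

```python
# Hypothetical helper mirroring the dtype policy of the converters above:
# ftype 0 -> keep everything f32, ftype 1 -> f16 only for 2-D .weight tensors.
import numpy as np

def convert_dtype(name: str, data: np.ndarray, ftype: int) -> np.ndarray:
    n_dims = data.ndim
    if ftype == 0 and data.dtype == np.float16:
        return data.astype(np.float32)
    if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
        return data.astype(np.float32)
    if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
        return data.astype(np.float16)
    return data

# 1-D norm weights stay f32, 2-D projection weights become f16 when ftype == 1
print(convert_dtype("blk.0.attn_norm.weight", np.zeros(8, dtype=np.float32), 1).dtype)
print(convert_dtype("blk.0.attn_q.weight", np.zeros((8, 8), dtype=np.float32), 1).dtype)
```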
137 convert.py (Executable file → Normal file)

@@ -3,11 +3,9 @@ from __future__ import annotations

 import argparse
 import concurrent.futures
-import copy
 import enum
 import faulthandler
 import functools
-import io
 import itertools
 import json
 import math
@@ -23,14 +21,14 @@ from abc import ABCMeta, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar

 import numpy as np
-from sentencepiece import SentencePieceProcessor  # type: ignore[import]
+from sentencepiece import SentencePieceProcessor

 import os
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

 if TYPE_CHECKING:
@@ -48,6 +46,7 @@ DEFAULT_CONCURRENCY = 8
 # data types
 #

 @dataclass(frozen=True)
 class DataType:
     name: str
@@ -57,15 +56,18 @@ class DataType:
     def elements_to_bytes(self, n_elements: int) -> int:
         return n_elements * self.dtype.itemsize

 @dataclass(frozen=True)
 class UnquantizedDataType(DataType):
     pass

 DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
 DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
 DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
 DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])

 @dataclass(frozen=True)
 class QuantizedDataType(DataType):
     block_size: int
@@ -79,6 +81,7 @@ class QuantizedDataType(DataType):
         assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
         return self.quantized_dtype.itemsize * (n_elements // self.block_size)

 @dataclass(frozen=True)
 class Q8_0QuantizedDataType(QuantizedDataType):
     # Mini Q8_0 quantization in Python!
@@ -88,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
         n_blocks = arr.size // self.block_size
         blocks = arr.reshape((n_blocks, self.block_size))
         # Much faster implementation of block quantization contributed by @Cebtenzzre

         def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
             d = abs(blocks).max(axis = 1) / np.float32(127)
             with np.errstate(divide = 'ignore'):
@@ -96,6 +100,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
             yield from zip(d, qs)
         return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)

 DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
     dtype = np.dtype(np.float32), valid_conversions = [],
     ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
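The `quantize_blocks_q8_0` context shown above computes, per 32-element block, a scale `d = max(|x|) / 127` and int8 quants `round(x / d)`. The following standalone NumPy sketch reproduces just that per-block math with a round-trip check; it is a simplified illustration and does not build the writer's structured `quantized_dtype`.

```python
# Standalone sketch of the Q8_0 block quantization shown in the hunk above:
# per block of 32 values, d = absmax / 127 and qs = round(x / d) stored as int8.
import numpy as np

BLOCK_SIZE = 32
arr = np.random.randn(4 * BLOCK_SIZE).astype(np.float32)
blocks = arr.reshape(-1, BLOCK_SIZE)

d = np.abs(blocks).max(axis=1) / np.float32(127)
with np.errstate(divide="ignore", invalid="ignore"):
    qs = np.where(d[:, None] == 0, 0, np.round(blocks / d[:, None])).astype(np.int8)

# dequantize and confirm the round-trip error stays within half a quantization step
dequant = qs.astype(np.float32) * d[:, None]
print(np.max(np.abs(dequant - blocks)) <= d.max() / 2 + 1e-6)
```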
@@ -118,6 +123,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
 # TODO: match this with `llama_ftype`
 # TODO: rename to LLAMAFileType
 # TODO: move to `gguf.py`

 class GGMLFileType(enum.IntEnum):
     AllF32 = 0
     MostlyF16 = 1  # except 1d tensors
@@ -130,6 +137,7 @@ class GGMLFileType(enum.IntEnum):
         # 1D tensors are always F32.
         return dt if len(tensor.shape) > 1 else DT_F32

 GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
     GGMLFileType.AllF32 : DT_F32,
     GGMLFileType.MostlyF16 : DT_F16,
@@ -140,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
 # hparams loading
 #

 @dataclass
 class Params:
     n_vocab: int
@@ -151,8 +160,11 @@ class Params:
     n_head_kv: int
     f_norm_eps: float

+    rope_scaling_type: gguf.RopeScalingType | None = None
     f_rope_freq_base: float | None = None
     f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None

     ftype: GGMLFileType | None = None

@@ -198,20 +210,20 @@ class Params:
     def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))

-        n_vocab = config["vocab_size"]
-        n_embd = config["hidden_size"]
-        n_layer = config["num_hidden_layers"]
-        n_ff = config["intermediate_size"]
-        n_head = config["num_attention_heads"]
-        n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
-        f_norm_eps = config["rms_norm_eps"]
-        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
+        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None

         rope_scaling = config.get("rope_scaling")
-        if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
-            f_rope_scale = config["rope_scaling"].get("factor")
-        else:
-            f_rope_scale = None
+        if rope_scaling is not None and (typ := rope_scaling.get("type")):
+            rope_factor = rope_scaling.get("factor")
+            f_rope_scale = rope_factor
+            if typ == "linear":
+                rope_scaling_type = gguf.RopeScalingType.LINEAR
+            elif typ == "yarn":
+                rope_scaling_type = gguf.RopeScalingType.YARN
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                rope_finetuned = rope_scaling['finetuned']
+            else:
+                raise NotImplementedError(f'Unknown rope scaling type: {typ}')

         if "max_sequence_length" in config:
             n_ctx = config["max_sequence_length"]
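For reference, the new `rope_scaling` handling above turns an HF `config.json` entry into a scaling type, factor and, for YaRN, the original context length and finetuned flag. The sketch below runs the same mapping on an example config dict; the config values are invented for illustration and plain strings stand in for `gguf.RopeScalingType` so it runs standalone.

```python
# Sketch of the rope_scaling parsing introduced above, on an illustrative config.
config = {
    "rope_theta": 10000.0,
    "rope_scaling": {
        "type": "yarn",
        "factor": 2.0,
        "original_max_position_embeddings": 4096,
        "finetuned": True,
    },
}

rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
rope_scaling = config.get("rope_scaling")
if rope_scaling is not None and (typ := rope_scaling.get("type")):
    f_rope_scale = rope_scaling.get("factor")
    if typ == "linear":
        rope_scaling_type = "linear"
    elif typ == "yarn":
        rope_scaling_type = "yarn"
        n_orig_ctx = rope_scaling["original_max_position_embeddings"]
        rope_finetuned = rope_scaling["finetuned"]
    else:
        raise NotImplementedError(f"Unknown rope scaling type: {typ}")

print(rope_scaling_type, f_rope_scale, n_orig_ctx, rope_finetuned)  # yarn 2.0 4096 True
```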
@@ -222,16 +234,19 @@ class Params:
                 "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

         return Params(
-            n_vocab = n_vocab,
-            n_embd = n_embd,
-            n_layer = n_layer,
+            n_vocab = config["vocab_size"],
+            n_embd = config["hidden_size"],
+            n_layer = config["num_hidden_layers"],
             n_ctx = n_ctx,
-            n_ff = n_ff,
-            n_head = n_head,
-            n_head_kv = n_head_kv,
-            f_norm_eps = f_norm_eps,
-            f_rope_freq_base = f_rope_freq_base,
+            n_ff = config["intermediate_size"],
+            n_head = (n_head := config["num_attention_heads"]),
+            n_head_kv = config.get("num_key_value_heads", n_head),
+            f_norm_eps = config["rms_norm_eps"],
+            f_rope_freq_base = config.get("rope_theta"),
+            rope_scaling_type = rope_scaling_type,
             f_rope_scale = f_rope_scale,
+            n_orig_ctx = n_orig_ctx,
+            rope_finetuned = rope_finetuned,
         )

     # LLaMA v2 70B params.json
@@ -240,17 +255,8 @@ class Params:
     def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))

-        n_vocab = config["vocab_size"] if "vocab_size" in config else -1
-        n_embd = config["dim"]
-        n_layer = config["n_layers"]
-        n_ff = -1
-        n_head = config["n_heads"]
-        n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
-        f_norm_eps = config["norm_eps"]
-        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
-
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if f_rope_freq_base == 1000000:
+        if config.get("rope_theta") == 1000000:
             # CodeLlama
             n_ctx = 16384
         elif config["norm_eps"] == 1e-05:
@@ -260,22 +266,16 @@ class Params:
             # LLaMA v1
             n_ctx = 2048

-        if n_vocab == -1:
-            n_vocab = model["tok_embeddings.weight"].shape[0]
-
-        if n_ff == -1:
-            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
-
         return Params(
-            n_vocab = n_vocab,
-            n_embd = n_embd,
-            n_layer = n_layer,
+            n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
+            n_embd = config["dim"],
+            n_layer = config["n_layers"],
             n_ctx = n_ctx,
-            n_ff = n_ff,
-            n_head = n_head,
-            n_head_kv = n_head_kv,
-            f_norm_eps = f_norm_eps,
-            f_rope_freq_base = f_rope_freq_base,
+            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
+            n_head = (n_head := config["n_heads"]),
+            n_head_kv = config.get("n_kv_heads", n_head),
+            f_norm_eps = config["norm_eps"],
+            f_rope_freq_base = config.get("rope_theta"),
         )

     @staticmethod
@@ -337,7 +337,6 @@ class BpeVocab:

     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.bpe_tokenizer
-        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}

         for i, _ in enumerate(tokenizer):
@@ -417,6 +416,7 @@ class SentencePieceVocab:
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

 Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'

 #
@@ -424,6 +424,7 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 # TODO: reuse (probably move to gguf.py?)
 #

 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
     # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
@@ -599,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
         return lazy_tensor.load().permute(n_head, n_head_kv)
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
     def load() -> Tensor:
         return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
@@ -606,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_
         s[0] = s[0] // 3
     return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

 def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
     def load() -> Tensor:
         return lazy_tensor.load().part(n_part)
@@ -701,6 +704,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
                            data_base_path=pickle_paths[0][:-4],
                            zip_file=zf)
     model = unpickler.load()
+    if 'model' in model: model = model['model']
     as_dict = dict(model.items())
     return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)

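The one-line addition to `lazy_load_torch_file` above unwraps checkpoints that nest their weights under a top-level `'model'` key. A tiny illustration with a made-up checkpoint dict:

```python
# Illustration of the 'model' unwrapping added above, using a made-up checkpoint dict.
checkpoint = {"model": {"tok_embeddings.weight": [0.0], "output.weight": [1.0]}}

model = checkpoint
if "model" in model:
    model = model["model"]

as_dict = dict(model.items())
print(sorted(as_dict))  # ['output.weight', 'tok_embeddings.weight']
```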
@@ -754,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')

 def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
     '''Parallel map, but with backpressure. If the caller doesn't call `next`
     fast enough, this will stop calling `func` at some point rather than
@@ -788,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
             break
         yield result

 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
@@ -831,8 +837,16 @@ class OutputFile:
         if params.f_rope_freq_base is not None:
             self.gguf.add_rope_freq_base(params.f_rope_freq_base)

-        if params.f_rope_scale is not None:
-            self.gguf.add_rope_scale_linear(params.f_rope_scale)
+        if params.rope_scaling_type:
+            assert params.f_rope_scale is not None
+            self.gguf.add_rope_scaling_type(params.rope_scaling_type)
+            self.gguf.add_rope_scaling_factor(params.f_rope_scale)
+
+        if params.n_orig_ctx is not None:
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
+
+        if params.rope_finetuned is not None:
+            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)

         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
@@ -852,7 +866,7 @@ class OutputFile:
         elif isinstance(vocab, BpeVocab):
             self.gguf.add_tokenizer_model("gpt2")
         else:
-            raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
+            raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
         self.gguf.add_token_list(tokens)
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
@@ -906,7 +920,7 @@ class OutputFile:
         return dt.quantize(arr)

     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)

         of = OutputFile(fname_out, endianess=endianess)
@@ -940,6 +954,7 @@ class OutputFile:

         of.close()

 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
     wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type

@@ -954,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT

     raise Exception(f"Unexpected combination of types: {name_to_type}")

 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
     return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
             for (name, tensor) in model.items()}

 def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     tmap = gguf.TensorNameMap(ARCH, params.n_layer)
     should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@@ -995,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:

     return out

 def nth_multifile_path(path: Path, n: int) -> Path | None:
     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
     the nth path in the model.
@@ -1039,7 +1057,8 @@ def load_some_model(path: Path) -> ModelPlus:
     # Be extra-friendly and accept either a file or a directory:
     if path.is_dir():
         # Check if it's a set of safetensors files first
-        files = list(path.glob("model-00001-of-*.safetensors"))
+        globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+        files = [file for glob in globs for file in path.glob(glob)]
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
@@ -1115,14 +1134,18 @@ def do_dump_model(model_plus: ModelPlus) -> None:

 def main(args_in: list[str] | None = None) -> None:
+    output_choices = ["f32", "f16"]
+    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
+        # We currently only support Q8_0 output on little endian systems.
+        output_choices.append("q8_0")
     parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+    parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
     parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
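The `main()` change above only offers `q8_0` as an `--outtype` choice when the host is little-endian, detected by comparing a `uint32` against the same bytes reinterpreted with an explicit little-endian dtype. The same probe in isolation, as a small sketch:

```python
# The little-endian probe used above, shown in isolation.
import numpy as np

output_choices = ["f32", "f16"]
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
    # on a little-endian host the explicit "<" view reads the same bytes as 1,
    # so the comparison holds and q8_0 output stays available
    output_choices.append("q8_0")

print(output_choices)
```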
BIN docs/llama-star/idea-arch.key (Executable file) Binary file not shown.
BIN docs/llama-star/idea-arch.pdf (Normal file) Binary file not shown.

@@ -17,7 +17,7 @@ llama_model_load_internal: [cublas] total VRAM used: 17223 MB
 If you see these lines, then the GPU is being used.

 ## Verifying that the CPU is not oversaturated
-llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
+llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.

 # Example of runtime flags effect on inference speed benchmark
 These runs were tested on the following machine:
@@ -24,6 +24,7 @@ else()
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
     add_subdirectory(main)
+    add_subdirectory(tokenize)
     add_subdirectory(parallel)
     add_subdirectory(perplexity)
     add_subdirectory(quantize)

@@ -153,7 +153,7 @@ while n_cur <= n_len {
         // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

         // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(context) || n_cur == n_len {
+        if new_token_id == llama_token_eos(model) || n_cur == n_len {
             i_batch[i] = -1
             // print("")
             if n_parallel > 1 {

@@ -1,9 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()

@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "ggml.h"

||||||
@ -172,7 +171,8 @@ int main(int argc, char ** argv) {
|
|||||||
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
||||||
|
|
||||||
// printf("Creating compute graph\n");
|
// printf("Creating compute graph\n");
|
||||||
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
|
struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
||||||
|
ggml_build_forward_expand(gf, m11xm2);
|
||||||
|
|
||||||
printf("n_threads=%i\n", benchmark_params.n_threads);
|
printf("n_threads=%i\n", benchmark_params.n_threads);
|
||||||
|
|
||||||
@ -181,9 +181,9 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::vector<uint8_t> work_buffer;
|
std::vector<uint8_t> work_buffer;
|
||||||
|
|
||||||
ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
|
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
|
||||||
|
|
||||||
TENSOR_DUMP(gf.nodes[0]);
|
TENSOR_DUMP(gf->nodes[0]);
|
||||||
|
|
||||||
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
||||||
|
|
||||||
@ -201,7 +201,8 @@ int main(int argc, char ** argv) {
|
|||||||
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
|
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
|
||||||
|
|
||||||
// printf("Creating compute graph\n");
|
// printf("Creating compute graph\n");
|
||||||
struct ggml_cgraph gf31 = ggml_build_forward(q31);
|
struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
|
||||||
|
ggml_build_forward_expand(gf31, q31);
|
||||||
|
|
||||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||||
// printf("Creating new tensor q12 & Running quantize\n");
|
// printf("Creating new tensor q12 & Running quantize\n");
|
||||||
@ -212,7 +213,8 @@ int main(int argc, char ** argv) {
|
|||||||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||||
|
|
||||||
//printf("Creating compute graph\n");
|
//printf("Creating compute graph\n");
|
||||||
struct ggml_cgraph gf32 = ggml_build_forward(q32);
|
struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
|
||||||
|
ggml_build_forward_expand(gf32, q32);
|
||||||
printf("n_threads=%i\n", benchmark_params.n_threads);
|
printf("n_threads=%i\n", benchmark_params.n_threads);
|
||||||
|
|
||||||
const int dimx = sizex;
|
const int dimx = sizex;
|
||||||
@ -224,7 +226,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
|
|
||||||
// Let's use the F32 result from above as a reference for the quantized multiplication
|
// Let's use the F32 result from above as a reference for the quantized multiplication
|
||||||
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
|
||||||
|
|
||||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||||
printf("=====================================================================================\n");
|
printf("=====================================================================================\n");
|
||||||
@ -234,7 +236,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
long long int start = ggml_time_us();
|
long long int start = ggml_time_us();
|
||||||
//printf("Running ggml_graph_compute\n");
|
//printf("Running ggml_graph_compute\n");
|
||||||
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
|
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
|
||||||
|
|
||||||
long long int stop = ggml_time_us();
|
long long int stop = ggml_time_us();
|
||||||
long long int usec = stop-start;
|
long long int usec = stop-start;
|
||||||
@ -252,7 +254,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// Check that the matrix multiplication result is in the right ballpark
|
// Check that the matrix multiplication result is in the right ballpark
|
||||||
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||||
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
|
||||||
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||||
|
|
||||||
@ -267,7 +269,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Running a different graph computation to make sure we override the CPU cache lines
|
// Running a different graph computation to make sure we override the CPU cache lines
|
||||||
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
|
ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
|
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
|
||||||
|
@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp)
|
|||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
#include "build-info.h"
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
@ -240,7 +240,7 @@ static struct lora_data * load_lora(struct lora_info * info) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_init_params params_ggml;
|
struct ggml_init_params params_ggml;
|
||||||
params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES;
|
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
|
||||||
params_ggml.mem_buffer = NULL;
|
params_ggml.mem_buffer = NULL;
|
||||||
params_ggml.no_alloc = true;
|
params_ggml.no_alloc = true;
|
||||||
result->ctx = ggml_init(params_ggml);
|
result->ctx = ggml_init(params_ggml);
|
||||||
@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
|
|||||||
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
|
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
|
||||||
|
|
||||||
struct ggml_init_params params;
|
struct ggml_init_params params;
|
||||||
params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
|
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
|
||||||
params.mem_buffer = NULL;
|
params.mem_buffer = NULL;
|
||||||
params.no_alloc = true;
|
params.no_alloc = true;
|
||||||
struct ggml_context * ctx = NULL;
|
struct ggml_context * ctx = NULL;
|
||||||
|
@@ -21,7 +21,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
 ./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
 ```

-Finetune output files will be saved every N iterations (config with `--save-every N`).
+**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
 The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
 So in above example after 10 iterations these files will be written:
 - chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf

@@ -3,9 +3,7 @@

 import argparse
 import gguf
-import os
 import struct
-import sys
 import numpy as np
 from pathlib import Path

|
@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
|
|||||||
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
|
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
|
||||||
|
|
||||||
randomize_tensor_normal(lora->tok_embeddings_a, rnd);
|
randomize_tensor_normal(lora->tok_embeddings_a, rnd);
|
||||||
randomize_tensor_normal(lora->tok_embeddings_b, rnd);
|
ggml_set_zero(lora->tok_embeddings_b);
|
||||||
randomize_tensor_normal(lora->norm_a, rnd);
|
randomize_tensor_normal(lora->norm_a, rnd);
|
||||||
randomize_tensor_normal(lora->norm_b, rnd);
|
ggml_set_zero(lora->norm_b);
|
||||||
randomize_tensor_normal(lora->output_a, rnd);
|
randomize_tensor_normal(lora->output_a, rnd);
|
||||||
randomize_tensor_normal(lora->output_b, rnd);
|
ggml_set_zero(lora->output_b);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
auto & layer = lora->layers[i];
|
auto & layer = lora->layers[i];
|
||||||
randomize_tensor_normal(layer.attention_norm_a, rnd);
|
randomize_tensor_normal(layer.attention_norm_a, rnd);
|
||||||
randomize_tensor_normal(layer.attention_norm_b, rnd);
|
ggml_set_zero(layer.attention_norm_b);
|
||||||
|
|
||||||
randomize_tensor_normal(layer.wq_a, rnd);
|
randomize_tensor_normal(layer.wq_a, rnd);
|
||||||
randomize_tensor_normal(layer.wq_b, rnd);
|
ggml_set_zero(layer.wq_b);
|
||||||
randomize_tensor_normal(layer.wk_a, rnd);
|
randomize_tensor_normal(layer.wk_a, rnd);
|
||||||
randomize_tensor_normal(layer.wk_b, rnd);
|
ggml_set_zero(layer.wk_b);
|
||||||
randomize_tensor_normal(layer.wv_a, rnd);
|
randomize_tensor_normal(layer.wv_a, rnd);
|
||||||
randomize_tensor_normal(layer.wv_b, rnd);
|
ggml_set_zero(layer.wv_b);
|
||||||
randomize_tensor_normal(layer.wo_a, rnd);
|
randomize_tensor_normal(layer.wo_a, rnd);
|
||||||
randomize_tensor_normal(layer.wo_b, rnd);
|
ggml_set_zero(layer.wo_b);
|
||||||
|
|
||||||
randomize_tensor_normal(layer.ffn_norm_a, rnd);
|
randomize_tensor_normal(layer.ffn_norm_a, rnd);
|
||||||
randomize_tensor_normal(layer.ffn_norm_b, rnd);
|
ggml_set_zero(layer.ffn_norm_b);
|
||||||
|
|
||||||
randomize_tensor_normal(layer.w1_a, rnd);
|
randomize_tensor_normal(layer.w1_a, rnd);
|
||||||
randomize_tensor_normal(layer.w1_b, rnd);
|
ggml_set_zero(layer.w1_b);
|
||||||
randomize_tensor_normal(layer.w2_a, rnd);
|
randomize_tensor_normal(layer.w2_a, rnd);
|
||||||
randomize_tensor_normal(layer.w2_b, rnd);
|
ggml_set_zero(layer.w2_b);
|
||||||
randomize_tensor_normal(layer.w3_a, rnd);
|
randomize_tensor_normal(layer.w3_a, rnd);
|
||||||
randomize_tensor_normal(layer.w3_b, rnd);
|
ggml_set_zero(layer.w3_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_random_normal_distribution(rnd);
|
free_random_normal_distribution(rnd);
|
||||||
@@ -642,8 +642,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     const int rope_mode = 0;

     return ggml_rope_custom(ctx,
-        t, KQ_pos, n_rot, rope_mode, n_ctx,
-        rope_freq_base, rope_freq_scale);
+        t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+        rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
+    );
 };

 set_name(tokens_input, "tokens_input");
@@ -652,7 +653,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);

 auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-    if (ggml_is_quantized(a->type)) {
+    if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
         return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
     } else if (a->type == GGML_TYPE_F32) {
         return ggml_add(ctx, a, b);
@@ -771,7 +772,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 if (enable_checkpointing) {
     ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
 } else {
-    *gb = *gf;
+    ggml_graph_cpy(gf, gb);
     ggml_build_backward_expand(ctx, gf, gb, true);
 }

@@ -1545,6 +1546,7 @@ int main(int argc, char ** argv) {
 srand(params.common.seed);

 struct llama_model_params llama_mparams = llama_model_default_params();
+llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
 llama_mparams.vocab_only = false;

 printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
@@ -1602,6 +1604,7 @@ int main(int argc, char ** argv) {
 opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
 opt->params.print_forward_graph = false;
 opt->params.print_backward_graph = false;
+opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
 opt->params.n_threads = params.common.n_threads;
 opt->params.past = params.common.opt_past;
 opt->params.delta = params.common.opt_delta;
@@ -1728,11 +1731,9 @@ int main(int argc, char ** argv) {
 ggml_allocr_free(alloc);

 // context for compute tensors without their data
-size_t estimated_compute_size_wo_data = (
-    ggml_tensor_overhead()*GGML_MAX_NODES*2
-    + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
-        params.common.use_checkpointing ? 3 : 2
-    )
+const size_t estimated_compute_size_wo_data = (
+    2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
+    (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
 );
 struct ggml_init_params ctx_compute_params = {
     estimated_compute_size_wo_data, // mem_size
@@ -1755,11 +1756,11 @@ int main(int argc, char ** argv) {
 for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
     ctx_compute = ggml_init(ctx_compute_params);
     alloc = ggml_allocr_new_measure(tensor_alignment);
-    gf = ggml_new_graph(ctx_compute);
+    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
     gf->order = (enum ggml_cgraph_eval_order) order;
-    gb = ggml_new_graph(ctx_compute);
+    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
     gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph(ctx_compute)
+        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
         : NULL;
     loss = llama_build_lora_finetune_graphs(
         &model, &lora, alloc, ctx_compute,
@@ -1788,11 +1789,11 @@ int main(int argc, char ** argv) {
 mem_compute_data.resize(max_compute_size);
 ctx_compute = ggml_init(ctx_compute_params);
 alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
-gf = ggml_new_graph(ctx_compute);
+gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
 gf->order = best_order;
-gb = ggml_new_graph(ctx_compute);
+gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
 gb_tmp = params.common.use_checkpointing
-    ? ggml_new_graph(ctx_compute)
+    ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
     : NULL;
 loss = llama_build_lora_finetune_graphs(
     &model, &lora, alloc, ctx_compute,
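The finetune hunks above stop relying on the fixed default graph size: graphs are now created with `ggml_new_graph_custom` and the compute context is sized from `LLAMA_TRAIN_MAX_NODES` using `ggml_graph_overhead_custom`. As a rough, non-authoritative sketch of that sizing pattern (not code from this commit; the `MAX_NODES` constant and the `no_alloc` setup below are illustrative assumptions):

```cpp
#include "ggml.h"

// Hypothetical node budget, standing in for LLAMA_TRAIN_MAX_NODES.
static const size_t MAX_NODES = 16384;

static struct ggml_cgraph * make_training_graph(struct ggml_context ** out_ctx) {
    // Reserve only metadata space: per-tensor overhead for up to MAX_NODES nodes
    // plus the overhead of one gradient-enabled graph of that size.
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*MAX_NODES +
                          ggml_graph_overhead_custom(MAX_NODES, /*grads =*/ true),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data is allocated elsewhere (e.g. by ggml-alloc)
    };
    struct ggml_context * ctx = ggml_init(params);

    // A graph sized explicitly instead of the default size, with gradient
    // bookkeeping enabled so a backward pass can be expanded into it.
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, MAX_NODES, /*grads =*/ true);

    *out_ctx = ctx; // caller keeps the context alive for as long as the graph is used
    return gf;
}
```

The design point of the change is the same as in the sketch: the node budget becomes an explicit parameter instead of a compile-time `GGML_MAX_NODES` constant, so training graphs can be much larger than inference graphs without touching ggml itself.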
examples/finetune/finetune.sh (new file, 34 lines)
@@ -0,0 +1,34 @@
+#!/bin/bash
+cd `dirname $0`
+cd ../..
+
+EXE="./finetune"
+
+if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
+if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
+
+# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
+MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
+
+while getopts "dg" opt; do
+  case $opt in
+    d)
+      DEBUGGER="gdb --args"
+      ;;
+    g)
+      EXE="./build/bin/Release/finetune"
+      GPUARG="--gpu-layers 25"
+      ;;
+  esac
+done
+
+$DEBUGGER $EXE \
+    --model-base $MODEL \
+    $GPUARG \
+    --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
+    --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
+    --lora-out lora-ol3b-shakespeare-ITERATION.bin \
+    --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
+    --save-every 10 \
+    --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
+    --use-checkpointing
@@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -2,7 +2,6 @@

 #include "console.h"
 #include "llama.h"
-#include "build-info.h"
 #include "grammar-parser.h"

 #include <cassert>
@@ -147,6 +146,13 @@ int main(int argc, char ** argv) {

         return 0;
     }
+    if (params.chatml) {
+        printf("\n************\n");
+        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
     if (!params.antiprompt.empty()) {
         printf("\n************\n");
         printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
@@ -184,8 +190,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }

-    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);

     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -231,7 +237,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         LOG_TEE("%s\n", get_system_info(params).c_str());
     }
-    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = llama_should_add_bos_token(model);
     LOG("add_bos: %d\n", add_bos);

     bool suff_rm_leading_spc = params.escape;
@@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -19,7 +19,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "build-info.h"
 #include "ggml-cuda.h"

 // utils
@@ -641,8 +640,8 @@ struct test {
     }
 };

-const std::string test::build_commit = BUILD_COMMIT;
-const int test::build_number = BUILD_NUMBER;
+const std::string test::build_commit = LLAMA_COMMIT;
+const int test::build_number = LLAMA_BUILD_NUMBER;
 const bool test::cuda = !!ggml_cpu_has_cublas();
 const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::metal = !!ggml_cpu_has_metal();
@@ -1,20 +1,36 @@
-set(TARGET clip)
-add_library(${TARGET} clip.cpp clip.h)
-install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if (NOT MSVC)
-    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
-endif()
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
+add_library(llava OBJECT
+            llava.cpp
+            llava.h
+            clip.cpp
+            clip.h
+            )
+
+target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(llava PUBLIC .)
+target_include_directories(llava PUBLIC ../..)
+target_include_directories(llava PUBLIC ../../common)
+
+target_compile_features(llava PRIVATE cxx_std_11)
+
+add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
+    target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS llava_shared LIBRARY)
 endif()

-set(TARGET llava)
-add_executable(${TARGET} llava.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
+if (NOT MSVC)
+    target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
+
+if(TARGET BUILD_INFO)
+    add_dependencies(llava BUILD_INFO)
+endif()
+
+set(TARGET llava-cli)
+add_executable(llava-cli llava-cli.cpp)
+install(TARGETS llava-cli RUNTIME)
+target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(llava PRIVATE cxx_std_11)
@@ -9,12 +9,12 @@ models are available.
 After API is confirmed, more models will be supported / uploaded.

 ## Usage
-Build with cmake or run `make llava` to build it.
+Build with cmake or run `make llava-cli` to build it.

-After building, run: `./llava` to see the usage. For example:
+After building, run: `./llava-cli` to see the usage. For example:

 ```sh
-./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```

 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -51,7 +51,6 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director

 ## TODO

-- [ ] Support server mode.
 - [ ] Support non-CPU backend for the image encoding part.
 - [ ] Support different sampling methods.
 - [ ] Support more model variants.
@@ -664,7 +664,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // measure mem requirement and allocate
     {
         static const size_t tensor_alignment = 32;
-        new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+        new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
         new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
         clip_image_f32_batch batch;
         batch.size = 1;
@@ -680,26 +680,44 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }

-clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); }
+clip_image_u8 * make_clip_image_u8() {
+    auto img = new clip_image_u8();
+    return img;
+}
 clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }

-bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
-    int nx, ny, nc;
-    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
-    if (!data) {
-        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname);
-        return false;
-    }
+void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
+void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }

+static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
     img->nx = nx;
     img->ny = ny;
     img->size = nx * ny * 3;
     img->data = new uint8_t[img->size]();
     memcpy(img->data, data, img->size);
+}
+
+bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
+    int nx, ny, nc;
+    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
+    if (!data) {
+        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
+        return false;
+    }
+    build_clip_img_from_data(data, nx, ny, img);
     stbi_image_free(data);
+    return true;
+}
+
+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
+    int nx, ny, nc;
+    auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
+    if (!data) {
+        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
+        return false;
+    }
+    build_clip_img_from_data(data, nx, ny, img);
+    stbi_image_free(data);
     return true;
 }

@@ -714,39 +732,40 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156

-    clip_image_u8 temp; // we will keep the input image data here temporarily
+    clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
     if (pad2square && img->nx != img->ny) {
         int longer_side = std::max(img->nx, img->ny);
-        temp.nx = longer_side;
-        temp.ny = longer_side;
-        temp.size = 3 * longer_side * longer_side;
-        temp.data = new uint8_t[temp.size]();
+        temp->nx = longer_side;
+        temp->ny = longer_side;
+        temp->size = 3 * longer_side * longer_side;
+        temp->data = new uint8_t[temp->size]();
         uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA

         // fill with background color
-        for (size_t i = 0; i < temp.size; i++) {
-            temp.data[i] = bc[i % 3];
+        for (size_t i = 0; i < temp->size; i++) {
+            temp->data[i] = bc[i % 3];
         }

         // copy from the input image
         for (int y = 0; y < img->ny; y++) {
             for (int x = 0; x < img->nx; x++) {
                 const int i = 3 * (y * img->nx + x);
-                const int j = 3 * (y * temp.nx + x);
-                temp.data[j] = img->data[i];
-                temp.data[j+1] = img->data[i+1];
-                temp.data[j+2] = img->data[i+2];
+                const int j = 3 * (y * temp->nx + x);
+                temp->data[j] = img->data[i];
+                temp->data[j+1] = img->data[i+1];
+                temp->data[j+2] = img->data[i+2];
             }
         }
     } else {
-        temp.nx = img->nx;
-        temp.ny = img->ny;
-        temp.size = img->size;
-        temp.data = img->data;
+        temp->nx = img->nx;
+        temp->ny = img->ny;
+        temp->size = img->size;
+        temp->data = new uint8_t[temp->size]();
+        memcpy(&temp->data[0], &img->data[0], temp->size); // copy
     }

-    const int nx = temp.nx;
-    const int ny = temp.ny;
+    const int nx = temp->nx;
+    const int ny = temp->ny;

     const int nx2 = ctx->vision_model.hparams.image_size;
     const int ny2 = ctx->vision_model.hparams.image_size;
@@ -785,10 +804,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
                 const int j10 = 3 * (y1 * nx + x0) + c;
                 const int j11 = 3 * (y1 * nx + x1) + c;

-                const float v00 = temp.data[j00];
-                const float v01 = temp.data[j01];
-                const float v10 = temp.data[j10];
-                const float v11 = temp.data[j11];
+                const float v00 = temp->data[j00];
+                const float v01 = temp->data[j01];
+                const float v10 = temp->data[j10];
+                const float v11 = temp->data[j11];

                 const float v0 = v00 * (1.0f - dx) + v01 * dx;
                 const float v1 = v10 * (1.0f - dx) + v11 * dx;
@@ -803,6 +822,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
             }
         }
     }
+    clip_image_u8_free(temp);

     return true;
 }
@@ -1049,16 +1069,16 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
     return true;
 }

-int clip_n_mmproj_embd(struct clip_ctx * ctx) {
+int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     return ctx->vision_model.mm_2_b->ne[0];
 }

-int clip_n_patches(struct clip_ctx * ctx) {
+int clip_n_patches(const struct clip_ctx * ctx) {
     auto & params = ctx->vision_model.hparams;

     return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 }

-size_t clip_embd_nbytes(struct clip_ctx * ctx) {
+size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
@@ -1,7 +1,22 @@
 #ifndef CLIP_H
 #define CLIP_H

-#include "ggml.h"
+#include <stddef.h>
+#include <stdint.h>

+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define CLIP_API __declspec(dllexport)
+#        else
+#            define CLIP_API __declspec(dllimport)
+#        endif
+#    else
+#        define CLIP_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define CLIP_API
+#endif
+
 struct clip_ctx;

@@ -20,19 +35,20 @@ struct clip_vision_hparams {
     float eps;
 };

-struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+/** load mmproj model */
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+/** free mmproj model */
+CLIP_API void clip_free(struct clip_ctx * ctx);

-void clip_free(struct clip_ctx * ctx);
-size_t clip_embd_nbytes(struct clip_ctx * ctx);
-int clip_n_patches(struct clip_ctx * ctx);
-int clip_n_mmproj_embd(struct clip_ctx * ctx);
+size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+int clip_n_patches(const struct clip_ctx * ctx);
+int clip_n_mmproj_embd(const struct clip_ctx * ctx);

 // RGB uint8 image
 struct clip_image_u8 {
     int nx;
     int ny;
-    uint8_t * data;
+    uint8_t * data = NULL;
     size_t size;
 };

@@ -41,7 +57,7 @@ struct clip_image_u8 {
 struct clip_image_f32 {
     int nx;
     int ny;
-    float * data;
+    float * data = NULL;
     size_t size;
 };

@@ -57,7 +73,12 @@ struct clip_image_f32_batch {

 struct clip_image_u8 * make_clip_image_u8();
 struct clip_image_f32 * make_clip_image_f32();
-bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+CLIP_API void clip_image_u8_free(clip_image_u8 * img);
+CLIP_API void clip_image_f32_free(clip_image_f32 * img);
+CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
+CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

 bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
 bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);

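Taken together, the clip.cpp and clip.h changes above expose a small exported C-style API: `CLIP_API` symbol visibility, const-correct accessors, a byte-buffer loader, and explicit free functions for the image structs. The following is a minimal, non-authoritative usage sketch; the model path, image path, and thread count are placeholders, not values taken from this commit:

```cpp
#include "clip.h"

#include <cstdio>
#include <vector>

int main() {
    // Placeholder paths and thread count.
    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/ 1);
    if (!ctx) {
        fprintf(stderr, "failed to load mmproj model\n");
        return 1;
    }

    clip_image_u8  * img     = make_clip_image_u8();
    clip_image_f32 * img_res = make_clip_image_f32();

    if (!clip_image_load_from_file("image.jpg", img) ||
        !clip_image_preprocess(ctx, img, img_res, /*pad2square=*/ true)) {
        fprintf(stderr, "failed to load or preprocess the image\n");
        return 1;
    }

    // One float vector per image patch, projected to the LLM embedding size.
    std::vector<float> embd(clip_n_patches(ctx) * clip_n_mmproj_embd(ctx));
    if (!clip_image_encode(ctx, /*n_threads=*/ 4, img_res, embd.data())) {
        fprintf(stderr, "failed to encode image\n");
        return 1;
    }

    // The new free functions own both the pixel buffers and the structs.
    clip_image_u8_free(img);
    clip_image_f32_free(img_res);
    clip_free(ctx);
    return 0;
}
```

The explicit `clip_image_*_free` functions matter for callers on the other side of a DLL boundary (the reason `CLIP_API` was added): the memory is released by the same module that allocated it.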
examples/llava/llava-cli.cpp (new file, 314 lines)
@@ -0,0 +1,314 @@
+#include "ggml.h"
+#include "common.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+
+#include "base64.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string str2 = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
+    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+    return true;
+}
+
+// TODO: use common/sampling.h
+static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
+    auto & sparams = params.sparams;
+
+    // out of user input, sample next token
+    const float temp = sparams.temp;
+    const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
+    const float top_p = sparams.top_p;
+    const float tfs_z = sparams.tfs_z;
+    const float typical_p = sparams.typical_p;
+    // const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
+    // const float repeat_penalty = sparams.repeat_penalty;
+    // const float alpha_presence = sparams.presence_penalty;
+    // const float alpha_frequency = sparams.frequency_penalty;
+    const int mirostat = sparams.mirostat;
+    const float mirostat_tau = sparams.mirostat_tau;
+    const float mirostat_eta = sparams.mirostat_eta;
+    // const bool penalize_nl = sparams.penalize_nl;
+
+    llama_token id = 0;
+    {
+        auto logits = llama_get_logits(ctx_llama);
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
+
+        // Apply params.logit_bias map
+        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
+            logits[it->first] += it->second;
+        }
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        if (temp <= 0) {
+            // Greedy sampling
+            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
+        } else {
+            if (mirostat == 1) {
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                const int mirostat_m = 100;
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+            } else if (mirostat == 2) {
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+            } else {
+                // Temperature sampling
+                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
+                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
+                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
+                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token(ctx_llama, &candidates_p);
+            }
+        }
+    }
+
+    return id;
+}
+
+static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
+    int id = sample_id(ctx_llama, params);
+    static std::string ret;
+    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
+
+static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
+static const char* IMG_BASE64_TAG_END = "\">";
+
+static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
+    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
+    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
+}
+
+static bool prompt_contains_image(const std::string& prompt) {
+    size_t begin, end;
+    find_image_tag_in_prompt(prompt, begin, end);
+    return (begin != std::string::npos);
+}
+
+// replaces the base64 image tag in the prompt with `replacement`
+static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
+    size_t img_base64_str_start, img_base64_str_end;
+    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
+    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
+        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        return NULL;
+    }
+
+    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
+    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
+    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
+
+    auto required_bytes = base64::required_encode_size(base64_str.size());
+    auto img_bytes = std::vector<unsigned char>(required_bytes);
+    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
+
+    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
+    if (!embed) {
+        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
+        return NULL;
+    }
+
+    return embed;
+}
+
+static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
+    size_t begin, end;
+    find_image_tag_in_prompt(prompt, begin, end);
+    if (begin == std::string::npos || end == std::string::npos) {
+        return prompt;
+    }
+    auto pre = prompt.substr(0, begin);
+    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
+    return pre + replacement + post;
+}
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
+
+    // load and preprocess the image
+    llava_image_embed * embed = NULL;
+    auto prompt = params->prompt;
+    if (prompt_contains_image(prompt)) {
+        if (!params->image.empty()) {
+            printf("using base64 encoded image instead of command line image path\n");
+        }
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        if (!embed) {
+            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
+            return NULL;
+        }
+        params->prompt = remove_image_from_prompt(prompt);
+    } else {
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
+        if (!embed) {
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
+            return NULL;
+        }
+    }
+
+    return embed;
+}
+
+static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
+    int n_past = 0;
+
+    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
+
+    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
+    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
+    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+
+    // generate the response
+
+    printf("\n");
+
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
+        if (strcmp(tmp, "</s>") == 0) break;
+
+        printf("%s", tmp);
+        fflush(stdout);
+    }
+
+    printf("\n");
+}
+
+
+static struct llava_context * llava_init(gpt_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+
+    llama_backend_init(params->numa);
+
+    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return NULL;
+    }
+
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->ctx_clip = ctx_clip;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    auto ctx_llava = llava_init(&params);
+    if (ctx_llava == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+        return 1;
+    }
+
+    auto image_embed = load_image(ctx_llava, &params);
+
+    // process the prompt
+    process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+    llama_print_timings(ctx_llava->ctx_llama);
+
+    llava_image_embed_free(image_embed);
+    llava_free(ctx_llava);
+    return 0;
+}
@@ -1,147 +0,0 @@
-#pragma once
-
-// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
-
-#include "common.h"
-#include "llama.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
-
-inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
-    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
-
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = N - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
-    int N = (int) tokens.size();
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past);
-}
-
-inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
-    std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
-    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
-    return true;
-}
-
-// TODO: use common/sampling.h
-inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
-    auto & sparams = params.sparams;
-
-    // out of user input, sample next token
-    const float temp = sparams.temp;
-    const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
-    const float top_p = sparams.top_p;
-    const float tfs_z = sparams.tfs_z;
-    const float typical_p = sparams.typical_p;
-    // const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
-    // const float repeat_penalty = sparams.repeat_penalty;
-    // const float alpha_presence = sparams.presence_penalty;
-    // const float alpha_frequency = sparams.frequency_penalty;
-    const int mirostat = sparams.mirostat;
-    const float mirostat_tau = sparams.mirostat_tau;
-    const float mirostat_eta = sparams.mirostat_eta;
-    // const bool penalize_nl = sparams.penalize_nl;
-
-    llama_token id = 0;
-    {
-        auto logits = llama_get_logits(ctx_llama);
-        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
-
-        // Apply params.logit_bias map
-        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
-            logits[it->first] += it->second;
-        }
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl(ctx)];
-        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-        // llama_sample_repetition_penalty(ctx, &candidates_p,
-        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //     last_n_repeat, repeat_penalty);
-        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //     last_n_repeat, alpha_frequency, alpha_presence);
-        // if (!penalize_nl) {
-        //     logits[llama_token_nl(ctx)] = nl_logit;
-        // }
-
-        if (temp <= 0) {
-            // Greedy sampling
-            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
-        } else {
-            if (mirostat == 1) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                const int mirostat_m = 100;
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-            } else if (mirostat == 2) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-            } else {
-                // Temperature sampling
-                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
-                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
-                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
-                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token(ctx_llama, &candidates_p);
-            }
-        }
-    }
-
-    return id;
-}
-
-inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
-    int id = sample_id(ctx_llama, params);
-    static std::string ret;
-    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
-        ret = "</s>";
-    } else {
-        ret = llama_token_to_piece(ctx_llama, id);
-    }
-    eval_id(ctx_llama, id, n_past);
-    return ret.c_str();
-}
@@ -1,164 +1,163 @@
 #include "clip.h"
-#include "llava-utils.h"
 #include "common.h"
 #include "llama.h"
+#include "llava.h"

 #include <cstdio>
 #include <cstdlib>
 #include <vector>

-static void show_additional_info(int /*argc*/, char ** argv) {
-    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+#include "base64.hpp"
+
+static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
+    clip_image_f32 * img_res = make_clip_image_f32();
+    if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
+        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
+        clip_image_f32_free(img_res);
+        return false;
     }

-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    if (params.mmproj.empty() || params.image.empty()) {
-        gpt_print_usage(argc, argv, params);
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    const char * clip_path = params.mmproj.c_str();
-    const char * img_path = params.image.c_str();
-
-    if (params.prompt.empty()) {
-        params.prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
-    // load and preprocess the image
-    clip_image_u8 img;
-    clip_image_f32 img_res;
-
-    if (!clip_image_load_from_file(img_path, &img)) {
-        fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
-
-        clip_free(ctx_clip);
-        return 1;
-    }
-
-    if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
-        fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);
-
-        clip_free(ctx_clip);
-        return 1;
-    }
-
-    int n_img_pos = clip_n_patches(ctx_clip);
-    int n_img_embd = clip_n_mmproj_embd(ctx_clip);
-
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
-
-    if (!image_embd) {
-        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
-
-        return 1;
-    }
+    *n_img_pos = clip_n_patches(ctx_clip);

     const int64_t t_img_enc_start_us = ggml_time_us();
-    if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
+    bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
+    clip_image_f32_free(img_res);
+    if (!encoded) {
         fprintf(stderr, "Unable to encode image\n");
-
-        return 1;
+        return false;
     }

     const int64_t t_img_enc_end_us = ggml_time_us();
+    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-    // we get the embeddings, free up the memory required for CLIP
-    clip_free(ctx_clip);
-
-    llama_backend_init(params.numa);
-
-    llama_model_params model_params = llama_model_default_params();
-    model_params.n_gpu_layers = params.n_gpu_layers;
-    model_params.main_gpu = params.main_gpu;
-    model_params.tensor_split = params.tensor_split;
-    model_params.use_mmap = params.use_mmap;
-    model_params.use_mlock = params.use_mlock;
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    llama_context_params ctx_params = llama_context_default_params();
-
-    ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    ctx_params.seed = params.seed;
-
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx_llama == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
+    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

+    return true;
+}
+
+bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
     // make sure that the correct mmproj was used, i.e., compare apples to apples
-    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
+    if (n_image_embd != n_llama_embd) {
+        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        return false;
+    }
+    return true;
+}

-    if (n_img_embd != n_llama_embd) {
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
-
-        llama_free(ctx_llama);
-        llama_free_model(model);
-        llama_backend_free();
+static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
+    if (!image_embd) {
+        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
         free(image_embd);
-        return 1;
+        return false;
     }

-    // process the prompt
-    // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
-
-    int n_past = 0;
-
-    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-
-    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
-    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
-    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);
-
-    // generate the response
-
-    printf("\n");
-    printf("prompt: '%s'\n", params.prompt.c_str());
-    printf("\n");
-
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_llama, params, &n_past);
-        if (strcmp(tmp, "</s>") == 0) break;
-
-        printf("%s", tmp);
-        fflush(stdout);
-    }
-
-    printf("\n");
-
-    {
-        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
-
-        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
-    }
-
-    llama_print_timings(ctx_llama);
-
-    llama_free(ctx_llama);
-    llama_free_model(model);
-    llama_backend_free();
+    int n_img_pos;
+    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
+        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
-    return 0;
+        return false;
+    }
+    *image_embd_out = image_embd;
+    *n_img_pos_out = n_img_pos;
+
+    return true;
+}
+
+bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
+    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+
+    for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
+        int n_eval = image_embed->n_image_pos - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        if (llama_decode(ctx_llama, batch)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
+    clip_image_u8 * img = make_clip_image_u8();
+    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
+        clip_image_u8_free(img);
+        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
+        return NULL;
+    }
+
+    float* image_embed = NULL;
+    int n_image_pos = 0;
+    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
+    if (!image_embed_result) {
+        clip_image_u8_free(img);
+        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
+        return NULL;
+    }
+
+    clip_image_u8_free(img);
+    auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    result->embed = image_embed;
+    result->n_image_pos = n_image_pos;
+    return result;
+}
+
+static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
+    auto file = fopen(path, "rb");
+    if (file == NULL) {
+        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
+        return false;
+    }
+
+    fseek(file, 0, SEEK_END);
+    auto fileSize = ftell(file);
+    fseek(file, 0, SEEK_SET);
+
+    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
+    if (buffer == NULL) {
+        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        perror("Memory allocation error");
+        fclose(file);
+        return false;
+    }
+    errno = 0;
+    size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
+    if (ferror(file)) {
+        die_fmt("read error: %s", strerror(errno));
+    }
+    if (ret != (size_t) fileSize) {
+        die("unexpectedly reached end of file");
+    }
+    fclose(file); // Close the file
+
+    *bytesOut = buffer;
+    *sizeOut = fileSize;
+    return true;
+}
+
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
+    unsigned char* image_bytes;
+    long image_bytes_length;
+    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
+    if (!loaded) {
+        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
+        return NULL;
+    }
+
+    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+    free(image_bytes);
+
+    return embed;
+}
+
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
+    free(embed->embed);
+    free(embed);
 }
|
50
examples/llava/llava.h
Normal file
50
examples/llava/llava.h
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
#ifndef LLAVA_H
|
||||||
|
#define LLAVA_H
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef LLAMA_SHARED
|
||||||
|
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||||
|
# ifdef LLAMA_BUILD
|
||||||
|
# define LLAVA_API __declspec(dllexport)
|
||||||
|
# else
|
||||||
|
# define LLAVA_API __declspec(dllimport)
|
||||||
|
# endif
|
||||||
|
# else
|
||||||
|
# define LLAVA_API __attribute__ ((visibility ("default")))
|
||||||
|
# endif
|
||||||
|
#else
|
||||||
|
# define LLAVA_API
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct clip_ctx;
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct llava_image_embed {
|
||||||
|
float * embed;
|
||||||
|
int n_image_pos;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** sanity check for clip <-> llava embed size match */
|
||||||
|
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
|
||||||
|
|
||||||
|
/** build an image embed from image file bytes */
|
||||||
|
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
||||||
|
/** build an image embed from a path to an image filename */
|
||||||
|
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
||||||
|
/** free an embedding made with llava_image_embed_make_* */
|
||||||
|
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
||||||
|
|
||||||
|
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
|
||||||
|
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
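A minimal usage sketch for the API declared above (illustrative only; the helper name, thread count, and batch size are placeholders, and error handling is abbreviated):

// Minimal usage sketch for the llava.h API above (illustrative only).
// Assumes a clip_ctx and llama_context created elsewhere; 4 threads and a
// batch size of 512 are placeholder values.
#include "llava.h"
#include "llama.h"

static bool eval_image(struct clip_ctx * ctx_clip, struct llama_context * ctx_llama, const char * image_path) {
    struct llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, /*n_threads=*/4, image_path);
    if (embed == NULL) {
        return false; // image could not be loaded or encoded
    }
    int n_past = 0;
    // writes the image embedding into the context; n_past ends up just after the image
    const bool ok = llava_eval_image_embed(ctx_llama, embed, /*n_batch=*/512, &n_past);
    llava_image_embed_free(embed);
    return ok;
}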
@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp)
|
|||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
@ -142,7 +142,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
|
|||||||
|
|
||||||
### Extended Context Size
|
### Extended Context Size
|
||||||
|
|
||||||
Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
|
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
|
||||||
|
|
||||||
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
|
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
|
||||||
|
|
||||||
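For the 4k-to-32k case described above, an invocation would therefore look roughly like `./main -m <32k-finetuned-model>.gguf --ctx-size 32768 --rope-scale 8` (the model file name here is a placeholder).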
@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
|
|||||||
|
|
||||||
Example usage: `--top-p 0.95`
|
Example usage: `--top-p 0.95`
|
||||||
|
|
||||||
|
### Min P Sampling
|
||||||
|
|
||||||
|
- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
|
||||||
|
|
||||||
|
The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, tokens with a probability below 0.045 are filtered out.
|
||||||
|
|
||||||
|
Example usage: `--min-p 0.05`
|
||||||
|
|
||||||
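A rough sketch of the rule described above (illustrative only; this is not the sampler implementation from the repository, and the function name is made up):

#include <algorithm>
#include <vector>

// keep only tokens whose probability is at least min_p times the top probability
// (assumes probs is non-empty)
static void min_p_filter(std::vector<float> & probs, float min_p) {
    const float max_prob  = *std::max_element(probs.begin(), probs.end());
    const float threshold = min_p * max_prob; // e.g. 0.05 * 0.9 = 0.045
    for (float & p : probs) {
        if (p < threshold) {
            p = 0.0f; // dropped; the survivors would be renormalized before sampling
        }
    }
}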
### Tail Free Sampling (TFS)
|
### Tail Free Sampling (TFS)
|
||||||
|
|
||||||
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
|
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "build-info.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
@ -153,8 +152,8 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||||
params.seed = time(NULL);
|
params.seed = time(NULL);
|
||||||
@ -230,13 +229,16 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
LOG("add_bos: %d\n", add_bos);
|
LOG("add_bos: %d\n", add_bos);
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
|
||||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
LOG("tokenize the prompt\n");
|
LOG("tokenize the prompt\n");
|
||||||
|
if (params.chatml) {
|
||||||
|
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
||||||
|
}
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG("use session tokens\n");
|
||||||
@ -314,7 +316,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// number of tokens to keep when resetting context
|
// number of tokens to keep when resetting context
|
||||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
|
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
|
||||||
params.n_keep = (int)embd_inp.size();
|
params.n_keep = (int)embd_inp.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -325,11 +327,23 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||||
|
|
||||||
|
// chatml prefix & suffix
|
||||||
|
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
|
||||||
|
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
||||||
|
|
||||||
|
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
||||||
|
LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
|
||||||
|
|
||||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||||
if (params.instruct) {
|
if (params.instruct) {
|
||||||
params.interactive_first = true;
|
params.interactive_first = true;
|
||||||
params.antiprompt.push_back("### Instruction:\n\n");
|
params.antiprompt.push_back("### Instruction:\n\n");
|
||||||
}
|
}
|
||||||
|
// similar for chatml mode
|
||||||
|
else if (params.chatml) {
|
||||||
|
params.interactive_first = true;
|
||||||
|
params.antiprompt.push_back("<|im_start|>user\n");
|
||||||
|
}
|
||||||
|
|
||||||
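Putting the chatml additions together (the system wrapper above plus cml_pfx and cml_sfx), the prompt the model sees has roughly the following shape; the helper below is a hypothetical reconstruction for illustration, not code from main.cpp:

#include <string>

// Mirrors the template assembled by the chatml branches above (illustrative only).
static std::string chatml_wrap(const std::string & system_prompt, const std::string & user_input) {
    return "<|im_start|>system\n" + system_prompt + "<|im_end|>" + // initial system block
           "\n<|im_start|>user\n" + user_input +                   // cml_pfx + user text
           "<|im_end|>\n<|im_start|>assistant\n";                  // cml_sfx; the model's reply follows
}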
// enable interactive mode if interactive start is specified
|
// enable interactive mode if interactive start is specified
|
||||||
if (params.interactive_first) {
|
if (params.interactive_first) {
|
||||||
@ -706,7 +720,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
printf("\n");
|
printf("\n");
|
||||||
} else if (params.instruct) {
|
} else if (params.instruct || params.chatml) {
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -714,7 +728,7 @@ int main(int argc, char ** argv) {
|
|||||||
if (n_past > 0 && is_interacting) {
|
if (n_past > 0 && is_interacting) {
|
||||||
LOG("waiting for user input\n");
|
LOG("waiting for user input\n");
|
||||||
|
|
||||||
if (params.instruct) {
|
if (params.instruct || params.chatml) {
|
||||||
printf("\n> ");
|
printf("\n> ");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -761,6 +775,12 @@ int main(int argc, char ** argv) {
|
|||||||
n_consumed = embd_inp.size();
|
n_consumed = embd_inp.size();
|
||||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||||
}
|
}
|
||||||
|
// chatml mode: insert user chat prefix
|
||||||
|
if (params.chatml && !is_antiprompt) {
|
||||||
|
LOG("inserting chatml prefix\n");
|
||||||
|
n_consumed = embd_inp.size();
|
||||||
|
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
|
||||||
|
}
|
||||||
if (params.escape) {
|
if (params.escape) {
|
||||||
process_escapes(buffer);
|
process_escapes(buffer);
|
||||||
}
|
}
|
||||||
@ -779,6 +799,11 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("inserting instruction suffix\n");
|
LOG("inserting instruction suffix\n");
|
||||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||||
}
|
}
|
||||||
|
// chatml mode: insert assistant chat suffix
|
||||||
|
if (params.chatml) {
|
||||||
|
LOG("inserting chatml suffix\n");
|
||||||
|
embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||||
const llama_token token = embd_inp[i];
|
const llama_token token = embd_inp[i];
|
||||||
@ -804,7 +829,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// end of text token
|
// end of text token
|
||||||
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
|
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
|
||||||
LOG_TEE(" [end of text]\n");
|
LOG_TEE(" [end of text]\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -34,7 +34,7 @@ int main(int argc, char ** argv) {
|
|||||||
struct ggml_context * ctx_data = NULL;
|
struct ggml_context * ctx_data = NULL;
|
||||||
struct ggml_context * ctx_eval = NULL;
|
struct ggml_context * ctx_eval = NULL;
|
||||||
|
|
||||||
struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
||||||
|
|
||||||
// this allocates all Metal resources and memory buffers
|
// this allocates all Metal resources and memory buffers
|
||||||
auto * ctx_metal = ggml_metal_init(1);
|
auto * ctx_metal = ggml_metal_init(1);
|
||||||
@ -46,13 +46,13 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// main
|
// main
|
||||||
{
|
{
|
||||||
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
|
struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
|
||||||
*(int32_t *) input->data = 1; // BOS
|
*(int32_t *) input->data = 1; // BOS
|
||||||
|
|
||||||
ggml_metal_set_tensor(ctx_metal, input);
|
ggml_metal_set_tensor(ctx_metal, input);
|
||||||
|
|
||||||
// warmup
|
// warmup
|
||||||
ggml_metal_graph_compute(ctx_metal, &gf);
|
ggml_metal_graph_compute(ctx_metal, gf);
|
||||||
|
|
||||||
const int n_iter = 16;
|
const int n_iter = 16;
|
||||||
|
|
||||||
@ -60,7 +60,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// the actual inference happens here
|
// the actual inference happens here
|
||||||
for (int i = 0; i < n_iter; ++i) {
|
for (int i = 0; i < n_iter; ++i) {
|
||||||
ggml_metal_graph_compute(ctx_metal, &gf);
|
ggml_metal_graph_compute(ctx_metal, gf);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t1 = ggml_time_us();
|
const int64_t t1 = ggml_time_us();
|
||||||
@ -70,7 +70,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// debug output
|
// debug output
|
||||||
{
|
{
|
||||||
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
|
struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
|
||||||
ggml_metal_get_tensor(ctx_metal, logits);
|
ggml_metal_get_tensor(ctx_metal, logits);
|
||||||
|
|
||||||
float * ptr = (float *) ggml_get_data(logits);
|
float * ptr = (float *) ggml_get_data(logits);
|
||||||
|
@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp)
|
|||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
# llama.cpp/example/parallel
|
# llama.cpp/example/parallel
|
||||||
|
|
||||||
Simplified simluation for serving incoming requests in parallel
|
Simplified simulation of serving incoming requests in parallel
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
// A basic application simulating a server with multiple clients.
|
// A basic application simulating a server with multiple clients.
|
||||||
// The clients submite requests to the server and they are processed in parallel.
|
// The clients submit requests to the server and they are processed in parallel.
|
||||||
|
|
||||||
#include "build-info.h"
|
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
@ -115,6 +113,8 @@ int main(int argc, char ** argv) {
|
|||||||
// insert new requests as soon as the previous one is done
|
// insert new requests as soon as the previous one is done
|
||||||
const bool cont_batching = params.cont_batching;
|
const bool cont_batching = params.cont_batching;
|
||||||
|
|
||||||
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
log_set_target(log_filename_generator("parallel", "log"));
|
log_set_target(log_filename_generator("parallel", "log"));
|
||||||
LOG_TEE("Log start\n");
|
LOG_TEE("Log start\n");
|
||||||
@ -174,6 +174,8 @@ int main(int argc, char ** argv) {
|
|||||||
int32_t n_total_gen = 0;
|
int32_t n_total_gen = 0;
|
||||||
int32_t n_cache_miss = 0;
|
int32_t n_cache_miss = 0;
|
||||||
|
|
||||||
|
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
|
||||||
|
|
||||||
const auto t_main_start = ggml_time_us();
|
const auto t_main_start = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
||||||
@ -203,6 +205,11 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("Processing requests ...\n\n");
|
LOG_TEE("Processing requests ...\n\n");
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
if (dump_kv_cache) {
|
||||||
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
|
dump_kv_cache_view_seqs(kvc_view, 40);
|
||||||
|
}
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
// decode any currently ongoing sequences
|
// decode any currently ongoing sequences
|
||||||
|
@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp)
|
|||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
#include "build-info.h"
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -150,8 +149,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||||||
// Output: `perplexity: 13.5106 [114/114]`
|
// Output: `perplexity: 13.5106 [114/114]`
|
||||||
// BOS tokens will be added for each chunk before eval
|
// BOS tokens will be added for each chunk before eval
|
||||||
|
|
||||||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
const bool add_bos = is_spm;
|
|
||||||
|
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
@ -289,8 +287,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
// Output: `perplexity: 13.5106 [114/114]`
|
// Output: `perplexity: 13.5106 [114/114]`
|
||||||
// BOS tokens will be added for each chunk before eval
|
// BOS tokens will be added for each chunk before eval
|
||||||
|
|
||||||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
const bool add_bos = is_spm;
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
@ -482,7 +479,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
// This is needed as usual for LLaMA models
|
||||||
const bool add_bos = is_spm;
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
|
||||||
// Number of tasks to use when computing the score
|
// Number of tasks to use when computing the score
|
||||||
if ( params.hellaswag_tasks < hs_task_count ) {
|
if ( params.hellaswag_tasks < hs_task_count ) {
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
set(TARGET quantize-stats)
|
set(TARGET quantize-stats)
|
||||||
add_executable(${TARGET} quantize-stats.cpp)
|
add_executable(${TARGET} quantize-stats.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
#define LLAMA_API_INTERNAL
|
#define LLAMA_API_INTERNAL
|
||||||
#include "build-info.h"
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
@ -1,9 +1,6 @@
|
|||||||
set(TARGET quantize)
|
set(TARGET quantize)
|
||||||
add_executable(${TARGET} quantize.cpp)
|
add_executable(${TARGET} quantize.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
#include "build-info.h"
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp)
|
|||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
#include "build-info.h"
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
@ -6,11 +6,8 @@ install(TARGETS ${TARGET} RUNTIME)
|
|||||||
target_compile_definitions(${TARGET} PRIVATE
|
target_compile_definitions(${TARGET} PRIVATE
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||||
)
|
)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||||
endif()
|
endif()
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
@ -7,7 +7,7 @@ Command line options:
|
|||||||
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
||||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||||
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
|
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
|
||||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||||
@ -122,6 +122,8 @@ node index.js
|
|||||||
|
|
||||||
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
|
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
|
||||||
|
|
||||||
|
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).
|
||||||
|
|
||||||
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
|
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
|
||||||
|
|
||||||
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
|
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
|
||||||
|
File diff suppressed because it is too large
@ -94,6 +94,10 @@ export async function* llama(prompt, params = {}, config = {}) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (result.error) {
|
||||||
|
result.error = JSON.parse(result.error);
|
||||||
|
console.error(`llama.cpp error: ${result.error.content}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -160,6 +160,11 @@
|
|||||||
height: 10em;
|
height: 10em;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[contenteditable] {
|
||||||
|
display: inline-block;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
outline: 0px solid transparent;
|
||||||
|
}
|
||||||
|
|
||||||
@keyframes loading-bg-wipe {
|
@keyframes loading-bg-wipe {
|
||||||
0% {
|
0% {
|
||||||
@ -219,6 +224,7 @@
|
|||||||
repeat_penalty: 1.18, // 1.0 = disabled
|
repeat_penalty: 1.18, // 1.0 = disabled
|
||||||
top_k: 40, // <= 0 to use vocab size
|
top_k: 40, // <= 0 to use vocab size
|
||||||
top_p: 0.5, // 1.0 = disabled
|
top_p: 0.5, // 1.0 = disabled
|
||||||
|
min_p: 0.05, // 0 = disabled
|
||||||
tfs_z: 1.0, // 1.0 = disabled
|
tfs_z: 1.0, // 1.0 = disabled
|
||||||
typical_p: 1.0, // 1.0 = disabled
|
typical_p: 1.0, // 1.0 = disabled
|
||||||
presence_penalty: 0.0, // 0.0 = disabled
|
presence_penalty: 0.0, // 0.0 = disabled
|
||||||
@ -461,18 +467,23 @@
|
|||||||
}, "{{char}}");
|
}, "{{char}}");
|
||||||
}
|
}
|
||||||
|
|
||||||
const runCompletion = async () => {
|
const runCompletion = () => {
|
||||||
if (controller.value) {
|
if (controller.value) {
|
||||||
console.log('already running...');
|
console.log('already running...');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const { prompt } = session.value;
|
const { prompt } = session.value;
|
||||||
transcriptUpdate([...session.value.transcript, ["", prompt]]);
|
transcriptUpdate([...session.value.transcript, ["", prompt]]);
|
||||||
await runLlama(prompt, {
|
runLlama(prompt, {
|
||||||
...params.value,
|
...params.value,
|
||||||
slot_id: slot_id,
|
slot_id: slot_id,
|
||||||
stop: [],
|
stop: [],
|
||||||
}, "");
|
}, "").finally(() => {
|
||||||
|
session.value.prompt = session.value.transcript.map(([_, data]) =>
|
||||||
|
Array.isArray(data) ? data.map(msg => msg.content).join('') : data
|
||||||
|
).join('');
|
||||||
|
session.value.transcript = [];
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
const stop = (e) => {
|
const stop = (e) => {
|
||||||
@ -572,6 +583,7 @@
|
|||||||
}
|
}
|
||||||
}, [messages])
|
}, [messages])
|
||||||
|
|
||||||
|
const isCompletionMode = session.value.type === 'completion'
|
||||||
const chatLine = ([user, data], index) => {
|
const chatLine = ([user, data], index) => {
|
||||||
let message
|
let message
|
||||||
const isArrayMessage = Array.isArray(data)
|
const isArrayMessage = Array.isArray(data)
|
||||||
@ -581,20 +593,31 @@
|
|||||||
const text = isArrayMessage ?
|
const text = isArrayMessage ?
|
||||||
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
|
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
|
||||||
data;
|
data;
|
||||||
message = html`<${Markdownish} text=${template(text)} />`
|
message = isCompletionMode ?
|
||||||
|
text :
|
||||||
|
html`<${Markdownish} text=${template(text)} />`
|
||||||
}
|
}
|
||||||
if (user) {
|
if (user) {
|
||||||
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
||||||
} else {
|
} else {
|
||||||
return html`<p key=${index}>${message}</p>`
|
return isCompletionMode ?
|
||||||
|
html`<span key=${index}>${message}</span>` :
|
||||||
|
html`<p key=${index}>${message}</p>`
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const handleCompletionEdit = (e) => {
|
||||||
|
session.value.prompt = e.target.innerText;
|
||||||
|
session.value.transcript = [];
|
||||||
|
}
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
<section id="chat" ref=${container}>
|
<div id="chat" ref=${container} key=${messages.length}>
|
||||||
<img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
|
<img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
|
||||||
|
<span contenteditable=${isCompletionMode} ref=${container} oninput=${handleCompletionEdit}>
|
||||||
${messages.flatMap(chatLine)}
|
${messages.flatMap(chatLine)}
|
||||||
</section>`;
|
</span>
|
||||||
|
</div>`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const ConfigForm = (props) => {
|
const ConfigForm = (props) => {
|
||||||
@ -744,6 +767,7 @@
|
|||||||
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
||||||
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
||||||
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
|
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
|
||||||
|
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
|
||||||
</fieldset>
|
</fieldset>
|
||||||
<details>
|
<details>
|
||||||
<summary>More options</summary>
|
<summary>More options</summary>
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "build-info.h"
|
|
||||||
#include "grammar-parser.h"
|
#include "grammar-parser.h"
|
||||||
|
|
||||||
#include "../llava/clip.h"
|
#include "../llava/clip.h"
|
||||||
@ -149,6 +148,7 @@ struct task_server {
|
|||||||
task_type type;
|
task_type type;
|
||||||
json data;
|
json data;
|
||||||
bool infill_mode = false;
|
bool infill_mode = false;
|
||||||
|
bool embedding_mode = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct task_result {
|
struct task_result {
|
||||||
@ -371,6 +371,7 @@ struct llama_client_slot
|
|||||||
std::vector<completion_token_output> generated_token_probs;
|
std::vector<completion_token_output> generated_token_probs;
|
||||||
|
|
||||||
bool infill = false;
|
bool infill = false;
|
||||||
|
bool embedding = false;
|
||||||
bool has_next_token = true;
|
bool has_next_token = true;
|
||||||
bool truncated = false;
|
bool truncated = false;
|
||||||
bool stopped_eos = false;
|
bool stopped_eos = false;
|
||||||
@ -500,6 +501,7 @@ struct llama_server_context
|
|||||||
bool multimodal = false;
|
bool multimodal = false;
|
||||||
bool clean_kv_cache = true;
|
bool clean_kv_cache = true;
|
||||||
bool all_slots_are_idle = false;
|
bool all_slots_are_idle = false;
|
||||||
|
bool add_bos_token = true;
|
||||||
|
|
||||||
int32_t id_gen;
|
int32_t id_gen;
|
||||||
int32_t n_ctx; // total context for all clients / slots
|
int32_t n_ctx; // total context for all clients / slots
|
||||||
@ -572,6 +574,8 @@ struct llama_server_context
|
|||||||
|
|
||||||
n_ctx = llama_n_ctx(ctx);
|
n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
|
add_bos_token = llama_should_add_bos_token(model);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -678,6 +682,7 @@ struct llama_server_context
|
|||||||
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||||
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||||
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||||
|
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||||
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||||
@ -862,7 +867,7 @@ struct llama_server_context
|
|||||||
}
|
}
|
||||||
|
|
||||||
void update_system_prompt() {
|
void update_system_prompt() {
|
||||||
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
|
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
@ -1090,6 +1095,7 @@ struct llama_server_context
|
|||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = id;
|
res.id = id;
|
||||||
|
res.stop = false;
|
||||||
res.error = true;
|
res.error = true;
|
||||||
res.result_json = { { "content", error } };
|
res.result_json = { { "content", error } };
|
||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
@ -1112,6 +1118,7 @@ struct llama_server_context
|
|||||||
{"temp", slot.sparams.temp},
|
{"temp", slot.sparams.temp},
|
||||||
{"top_k", slot.sparams.top_k},
|
{"top_k", slot.sparams.top_k},
|
||||||
{"top_p", slot.sparams.top_p},
|
{"top_p", slot.sparams.top_p},
|
||||||
|
{"min_p", slot.sparams.min_p},
|
||||||
{"tfs_z", slot.sparams.tfs_z},
|
{"tfs_z", slot.sparams.tfs_z},
|
||||||
{"typical_p", slot.sparams.typical_p},
|
{"typical_p", slot.sparams.typical_p},
|
||||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||||
@ -1244,13 +1251,15 @@ struct llama_server_context
|
|||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
int request_completion(json data, bool infill)
|
int request_completion(json data, bool infill, bool embedding)
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
task_server task;
|
task_server task;
|
||||||
task.id = id_gen++;
|
task.id = id_gen++;
|
||||||
|
task.target_id = 0;
|
||||||
task.data = data;
|
task.data = data;
|
||||||
task.infill_mode = infill;
|
task.infill_mode = infill;
|
||||||
|
task.embedding_mode = embedding;
|
||||||
task.type = COMPLETION_TASK;
|
task.type = COMPLETION_TASK;
|
||||||
queue_tasks.push_back(task);
|
queue_tasks.push_back(task);
|
||||||
return task.id;
|
return task.id;
|
||||||
@ -1376,7 +1385,7 @@ struct llama_server_context
|
|||||||
{
|
{
|
||||||
LOG_TEE("slot unavailable\n");
|
LOG_TEE("slot unavailable\n");
|
||||||
// send error result
|
// send error result
|
||||||
send_error(task.id, "slot unavaliable");
|
send_error(task.id, "slot unavailable");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1388,6 +1397,7 @@ struct llama_server_context
|
|||||||
slot->reset();
|
slot->reset();
|
||||||
|
|
||||||
slot->infill = task.infill_mode;
|
slot->infill = task.infill_mode;
|
||||||
|
slot->embedding = task.embedding_mode;
|
||||||
slot->task_id = task.id;
|
slot->task_id = task.id;
|
||||||
|
|
||||||
if (!launch_slot_with_data(slot, task.data))
|
if (!launch_slot_with_data(slot, task.data))
|
||||||
@ -1547,20 +1557,11 @@ struct llama_server_context
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.num_prompt_tokens = prompt_tokens.size();
|
slot.num_prompt_tokens = prompt_tokens.size();
|
||||||
|
|
||||||
if (!slot.params.cache_prompt)
|
|
||||||
{
|
|
||||||
llama_sampling_reset(slot.ctx_sampling);
|
|
||||||
|
|
||||||
slot.n_past = 0;
|
|
||||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (slot.params.n_keep < 0)
|
if (slot.params.n_keep < 0)
|
||||||
{
|
{
|
||||||
slot.params.n_keep = slot.num_prompt_tokens;
|
slot.params.n_keep = slot.num_prompt_tokens;
|
||||||
@ -1590,6 +1591,15 @@ struct llama_server_context
|
|||||||
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!slot.params.cache_prompt)
|
||||||
|
{
|
||||||
|
llama_sampling_reset(slot.ctx_sampling);
|
||||||
|
|
||||||
|
slot.n_past = 0;
|
||||||
|
slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
// push the prompt into the sampling context (do not apply grammar)
|
// push the prompt into the sampling context (do not apply grammar)
|
||||||
for (auto &token : prompt_tokens)
|
for (auto &token : prompt_tokens)
|
||||||
{
|
{
|
||||||
@ -1624,7 +1634,7 @@ struct llama_server_context
|
|||||||
const bool has_images = process_images(slot);
|
const bool has_images = process_images(slot);
|
||||||
|
|
||||||
// process the prefix of the first image
|
// process the prefix of the first image
|
||||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
|
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||||
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
||||||
{
|
{
|
||||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
|
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
|
||||||
@ -1695,7 +1705,7 @@ struct llama_server_context
|
|||||||
}
|
}
|
||||||
|
|
||||||
// prompt evaluated for embedding
|
// prompt evaluated for embedding
|
||||||
if (params.embedding)
|
if (slot.embedding)
|
||||||
{
|
{
|
||||||
send_embedding(slot);
|
send_embedding(slot);
|
||||||
slot.release();
|
slot.release();
|
||||||
@ -1754,8 +1764,14 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||||
|
printf(" --rope-scaling {none,linear,yarn}\n");
|
||||||
|
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
|
||||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
|
||||||
|
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
|
||||||
|
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||||
|
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||||
|
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||||
@ -1877,6 +1893,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
}
|
}
|
||||||
params.n_ctx = std::stoi(argv[i]);
|
params.n_ctx = std::stoi(argv[i]);
|
||||||
}
|
}
|
||||||
|
else if (arg == "--rope-scaling")
|
||||||
|
{
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
std::string value(argv[i]);
|
||||||
|
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||||
|
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||||
|
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||||
|
else { invalid_param = true; break; }
|
||||||
|
}
|
||||||
else if (arg == "--rope-freq-base")
|
else if (arg == "--rope-freq-base")
|
||||||
{
|
{
|
||||||
if (++i >= argc)
|
if (++i >= argc)
|
||||||
@ -1895,6 +1924,38 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
}
|
}
|
||||||
params.rope_freq_scale = std::stof(argv[i]);
|
params.rope_freq_scale = std::stof(argv[i]);
|
||||||
}
|
}
|
||||||
|
else if (arg == "--yarn-ext-factor")
|
||||||
|
{
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.yarn_ext_factor = std::stof(argv[i]);
|
||||||
|
}
|
||||||
|
else if (arg == "--yarn-attn-factor")
|
||||||
|
{
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.yarn_attn_factor = std::stof(argv[i]);
|
||||||
|
}
|
||||||
|
else if (arg == "--yarn-beta-fast")
|
||||||
|
{
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.yarn_beta_fast = std::stof(argv[i]);
|
||||||
|
}
|
||||||
|
else if (arg == "--yarn-beta-slow")
|
||||||
|
{
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.yarn_beta_slow = std::stof(argv[i]);
|
||||||
|
}
|
||||||
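For illustration, the scaling options parsed above can be combined on the command line roughly as `./server -m models/7B/ggml-model.gguf -c 16384 --rope-scaling yarn --rope-freq-scale 0.25` (model path and values are placeholders); the `--yarn-*` parameters typically accompany `--rope-scaling yarn`.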
else if (arg == "--memory-f32" || arg == "--memory_f32")
|
else if (arg == "--memory-f32" || arg == "--memory_f32")
|
||||||
{
|
{
|
||||||
params.memory_f16 = false;
|
params.memory_f16 = false;
|
||||||
@ -2209,8 +2270,8 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
llama_backend_init(params.numa);
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
LOG_INFO("build info", {{"build", BUILD_NUMBER},
|
LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
|
||||||
{"commit", BUILD_COMMIT}});
|
{"commit", LLAMA_COMMIT}});
|
||||||
|
|
||||||
LOG_INFO("system info", {
|
LOG_INFO("system info", {
|
||||||
{"n_threads", params.n_threads},
|
{"n_threads", params.n_threads},
|
||||||
@ -2274,7 +2335,7 @@ int main(int argc, char **argv)
|
|||||||
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
|
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||||
{
|
{
|
||||||
json data = json::parse(req.body);
|
json data = json::parse(req.body);
|
||||||
const int task_id = llama.request_completion(data, false);
|
const int task_id = llama.request_completion(data, false, false);
|
||||||
if (!json_value(data, "stream", false)) {
|
if (!json_value(data, "stream", false)) {
|
||||||
std::string completion_text;
|
std::string completion_text;
|
||||||
task_result result = llama.next_result(task_id);
|
task_result result = llama.next_result(task_id);
|
||||||
@ -2309,6 +2370,17 @@ int main(int argc, char **argv)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
const std::string str =
|
||||||
|
"error: " +
|
||||||
|
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||||
|
"\n\n";
|
||||||
|
LOG_VERBOSE("data stream", {
|
||||||
|
{ "to_send", str }
|
||||||
|
});
|
||||||
|
if (!sink.write(str.c_str(), str.size()))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2329,7 +2401,7 @@ int main(int argc, char **argv)
|
|||||||
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
|
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||||
{
|
{
|
||||||
json data = json::parse(req.body);
|
json data = json::parse(req.body);
|
||||||
const int task_id = llama.request_completion(data, true);
|
const int task_id = llama.request_completion(data, true, false);
|
||||||
if (!json_value(data, "stream", false)) {
|
if (!json_value(data, "stream", false)) {
|
||||||
std::string completion_text;
|
std::string completion_text;
|
||||||
task_result result = llama.next_result(task_id);
|
task_result result = llama.next_result(task_id);
|
||||||
@ -2433,7 +2505,7 @@ int main(int argc, char **argv)
|
|||||||
{
|
{
|
||||||
prompt = "";
|
prompt = "";
|
||||||
}
|
}
|
||||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
|
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
|
||||||
task_result result = llama.next_result(task_id);
|
task_result result = llama.next_result(task_id);
|
||||||
return res.set_content(result.result_json.dump(), "application/json");
|
return res.set_content(result.result_json.dump(), "application/json");
|
||||||
});
|
});
|
||||||
|
@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp)
|
|||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
#include "build-info.h"
|
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -39,9 +37,11 @@ int main(int argc, char ** argv) {
|
|||||||
// max number of parallel drafting sequences (i.e. tree branches)
|
// max number of parallel drafting sequences (i.e. tree branches)
|
||||||
const int n_seq_dft = params.n_parallel;
|
const int n_seq_dft = params.n_parallel;
|
||||||
|
|
||||||
// TODO: make this configurable
|
// probability threshold for accepting a token from the draft model
|
||||||
const float p_accept = 0.80f;
|
const float p_accept = params.p_accept;
|
||||||
const float p_split = 0.10f;
|
|
||||||
|
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||||
|
const float p_split = params.p_split;
|
||||||
|
|
||||||
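To make the role of the two thresholds above concrete, the drafting loop consults them roughly like this (a simplified sketch of the idea, not the actual control flow in speculative.cpp; function names are made up):

// Simplified sketch of how the thresholds steer drafting (illustrative only).
static bool keep_drafting(float cur_p, float p_accept) {
    // below p_accept the draft token is considered too unlikely and drafting stops
    return cur_p >= p_accept;
}

static bool should_split(float runner_up_p, float p_split, int n_seq_dft) {
    // an extra draft branch is only forked when multiple draft sequences are allowed
    return n_seq_dft > 1 && runner_up_p >= p_split;
}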
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
log_set_target(log_filename_generator("speculative", "log"));
|
log_set_target(log_filename_generator("speculative", "log"));
|
||||||
@ -94,9 +94,22 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// tokenize the prompt
|
|
||||||
|
// Tokenize the prompt
|
||||||
|
const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
|
||||||
|
LOG("add_bos tgt: %d\n", add_bos_tgt);
|
||||||
|
|
||||||
|
const bool add_bos_dft = llama_should_add_bos_token(model_dft);
|
||||||
|
LOG("add_bos dft: %d\n", add_bos_dft);
|
||||||
|
|
||||||
|
if (add_bos_tgt != add_bos_dft) {
|
||||||
|
fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
|
||||||
|
fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
|
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
|
||||||
|
|
||||||
const int max_context_size = llama_n_ctx(ctx_tgt);
|
const int max_context_size = llama_n_ctx(ctx_tgt);
|
||||||
const int max_tokens_list_size = max_context_size - 4;
|
const int max_tokens_list_size = max_context_size - 4;
|
||||||
|
5
examples/tokenize/CMakeLists.txt
Normal file
5
examples/tokenize/CMakeLists.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
set(TARGET tokenize)
|
||||||
|
add_executable(${TARGET} tokenize.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
44
examples/tokenize/tokenize.cpp
Normal file
44
examples/tokenize/tokenize.cpp
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
if (argc < 3 || argv[1][0] == '-') {
|
||||||
|
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * model_path = argv[1];
|
||||||
|
const char * prompt = argv[2];
|
||||||
|
|
||||||
|
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
||||||
|
|
||||||
|
llama_backend_init(false);
|
||||||
|
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
model_params.vocab_only = true;
|
||||||
|
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
||||||
|
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
|
||||||
|
std::vector<llama_token> tokens;
|
||||||
|
|
||||||
|
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||||
|
|
||||||
|
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||||
|
if (printing_ids) {
|
||||||
|
printf("%d\n", tokens[i]);
|
||||||
|
} else {
|
||||||
|
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
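As a usage illustration for the new tool (model path and prompt are placeholders): `tokenize models/7B/ggml-model.gguf "Hello world"` prints one `id -> 'piece'` line per token, while adding `--ids` as the third argument prints only the raw token ids, one per line.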
@ -9,7 +9,7 @@ import numpy as np
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
|
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
# gguf constants
|
# gguf constants
|
||||||
|
@ -349,9 +349,9 @@ static struct ggml_tensor * llama_build_train_graphs(
|
|||||||
// not capturing these, to silence warnings
|
// not capturing these, to silence warnings
|
||||||
const int rope_mode = 0;
|
const int rope_mode = 0;
|
||||||
|
|
||||||
return ggml_rope_custom(ctx,
|
return ggml_rope_custom(
|
||||||
t, KQ_pos, n_rot, rope_mode, n_ctx,
|
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
||||||
rope_freq_base, rope_freq_scale);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
set_name(tokens_input, "tokens_input");
|
set_name(tokens_input, "tokens_input");
|
||||||
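Aside: the rope call above is widened from 8 to 13 arguments. A sketch with each positional argument labelled; the parameter names are my reading of the extended ggml_rope_custom signature and should be treated as assumptions, not the upstream header:

#include "ggml.h"

static struct ggml_tensor * rope(struct ggml_context * ctx, struct ggml_tensor * t, struct ggml_tensor * KQ_pos,
                                 int n_rot, int rope_mode, int n_ctx,
                                 float rope_freq_base, float rope_freq_scale) {
    return ggml_rope_custom(
        ctx, t, KQ_pos,
        n_rot,             // n_dims
        rope_mode,         // mode
        n_ctx,             // n_ctx
        0,                 // n_orig_ctx  (assumed name)
        rope_freq_base,    // freq_base
        rope_freq_scale,   // freq_scale
        0.0f,              // ext_factor  (assumed name)
        1.0f,              // attn_factor (assumed name)
        0.0f,              // beta_fast   (assumed name)
        0.0f);             // beta_slow   (assumed name)
}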
@ -436,7 +436,7 @@ static struct ggml_tensor * llama_build_train_graphs(
|
|||||||
if (enable_checkpointing) {
|
if (enable_checkpointing) {
|
||||||
ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
|
ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
|
||||||
} else {
|
} else {
|
||||||
*gb = *gf;
|
ggml_graph_cpy(gf, gb);
|
||||||
ggml_build_backward_expand(ctx, gf, gb, true);
|
ggml_build_backward_expand(ctx, gf, gb, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
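Aside: *gb = *gf is replaced by ggml_graph_cpy; as I read it, graphs created with ggml_new_graph_custom keep their node arrays outside the struct itself, so a plain struct assignment would copy only the header. A sketch of the resulting pattern, using the calls that appear in this file; max_nodes stands in for LLAMA_TRAIN_MAX_NODES:

#include "ggml.h"

static void make_train_graphs(struct ggml_context * ctx, int max_nodes,
                              struct ggml_cgraph ** out_gf, struct ggml_cgraph ** out_gb) {
    // forward and backward graphs sized for up to max_nodes nodes, gradients enabled
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, max_nodes, true);
    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx, max_nodes, true);

    // ... build the forward pass into gf here ...

    ggml_graph_cpy(gf, gb);                        // seed gb with the forward nodes
    ggml_build_backward_expand(ctx, gf, gb, true); // then append the backward pass

    *out_gf = gf;
    *out_gb = gb;
}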
@ -1006,6 +1006,7 @@ int main(int argc, char ** argv) {
|
|||||||
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||||
opt->params.print_forward_graph = false;
|
opt->params.print_forward_graph = false;
|
||||||
opt->params.print_backward_graph = false;
|
opt->params.print_backward_graph = false;
|
||||||
|
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||||
opt->params.n_threads = params.common.n_threads;
|
opt->params.n_threads = params.common.n_threads;
|
||||||
opt->params.past = params.common.opt_past;
|
opt->params.past = params.common.opt_past;
|
||||||
opt->params.delta = params.common.opt_delta;
|
opt->params.delta = params.common.opt_delta;
|
||||||
@ -1108,11 +1109,9 @@ int main(int argc, char ** argv) {
|
|||||||
ggml_allocr_free(alloc);
|
ggml_allocr_free(alloc);
|
||||||
|
|
||||||
// context for compute tensors without their data
|
// context for compute tensors without their data
|
||||||
size_t estimated_compute_size_wo_data = (
|
const size_t estimated_compute_size_wo_data = (
|
||||||
ggml_tensor_overhead()*GGML_MAX_NODES*2
|
2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
|
||||||
+ (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
|
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
|
||||||
params.common.use_checkpointing ? 3 : 2
|
|
||||||
)
|
|
||||||
);
|
);
|
||||||
struct ggml_init_params ctx_compute_params = {
|
struct ggml_init_params ctx_compute_params = {
|
||||||
estimated_compute_size_wo_data, // mem_size
|
estimated_compute_size_wo_data, // mem_size
|
||||||
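Aside: the new size estimate above can be read as "tensor metadata for both graphs, plus the graph objects themselves". The same expression pulled into a helper so the terms are visible; max_nodes stands in for LLAMA_TRAIN_MAX_NODES and only calls used in the hunk appear:

#include "ggml.h"

static size_t estimate_compute_size_wo_data(int max_nodes, bool use_checkpointing) {
    // metadata for the tensors of the forward and backward graphs
    const size_t tensor_meta = 2*(size_t) max_nodes*ggml_tensor_overhead();

    // the graph objects: gf and gb, plus gb_tmp when checkpointing is enabled
    const size_t graph_meta  = (use_checkpointing ? 3 : 2)
                             * (GGML_OBJECT_SIZE + ggml_graph_overhead_custom(max_nodes, true));

    return tensor_meta + graph_meta;
}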
@ -1135,11 +1134,11 @@ int main(int argc, char ** argv) {
|
|||||||
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
||||||
ctx_compute = ggml_init(ctx_compute_params);
|
ctx_compute = ggml_init(ctx_compute_params);
|
||||||
alloc = ggml_allocr_new_measure(tensor_alignment);
|
alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||||
gf = ggml_new_graph(ctx_compute);
|
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
gf->order = (enum ggml_cgraph_eval_order) order;
|
gf->order = (enum ggml_cgraph_eval_order) order;
|
||||||
gb = ggml_new_graph(ctx_compute);
|
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
gb_tmp = params.common.use_checkpointing
|
gb_tmp = params.common.use_checkpointing
|
||||||
? ggml_new_graph(ctx_compute)
|
? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
|
||||||
: NULL;
|
: NULL;
|
||||||
loss = llama_build_train_graphs(
|
loss = llama_build_train_graphs(
|
||||||
&model, alloc, ctx_compute,
|
&model, alloc, ctx_compute,
|
||||||
@ -1168,11 +1167,11 @@ int main(int argc, char ** argv) {
|
|||||||
mem_compute_data.resize(max_compute_size);
|
mem_compute_data.resize(max_compute_size);
|
||||||
ctx_compute = ggml_init(ctx_compute_params);
|
ctx_compute = ggml_init(ctx_compute_params);
|
||||||
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
|
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
|
||||||
gf = ggml_new_graph(ctx_compute);
|
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
gf->order = best_order;
|
gf->order = best_order;
|
||||||
gb = ggml_new_graph(ctx_compute);
|
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
gb_tmp = params.common.use_checkpointing
|
gb_tmp = params.common.use_checkpointing
|
||||||
? ggml_new_graph(ctx_compute)
|
? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
|
||||||
: NULL;
|
: NULL;
|
||||||
loss = llama_build_train_graphs(
|
loss = llama_build_train_graphs(
|
||||||
&model, alloc, ctx_compute,
|
&model, alloc, ctx_compute,
|
||||||
|
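Aside: both loops above follow the same measure-then-allocate pattern, now with graphs created through ggml_new_graph_custom. A condensed sketch of the measure half, assuming the legacy allocator API declared in ggml-alloc.h further below; the graph construction itself is elided:

#include "ggml.h"
#include "ggml-alloc.h"

static size_t measure_compute_size(struct ggml_init_params ctx_compute_params,
                                   int max_nodes, size_t tensor_alignment) {
    struct ggml_context * ctx_compute = ggml_init(ctx_compute_params);

    // a measure allocator assigns placeholder addresses and only tracks the high-water mark
    ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx_compute, max_nodes, true);
    // ... build the training graphs into gf (and gb/gb_tmp) here, as in the loops above ...

    const size_t max_compute_size = ggml_allocr_alloc_graph(alloc, gf);

    ggml_allocr_free(alloc);
    ggml_free(ctx_compute);

    return max_compute_size;
}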
12
flake.lock
12
flake.lock
@ -5,11 +5,11 @@
|
|||||||
"systems": "systems"
|
"systems": "systems"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1692799911,
|
"lastModified": 1694529238,
|
||||||
"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
|
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
|
||||||
"owner": "numtide",
|
"owner": "numtide",
|
||||||
"repo": "flake-utils",
|
"repo": "flake-utils",
|
||||||
"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
|
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@ -20,11 +20,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1698134075,
|
"lastModified": 1698318101,
|
||||||
"narHash": "sha256-foCD+nuKzfh49bIoiCBur4+Fx1nozo+4C/6k8BYk4sg=",
|
"narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "8efd5d1e283604f75a808a20e6cde0ef313d07d4",
|
"rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
10
flake.nix
10
flake.nix
@ -11,8 +11,7 @@
|
|||||||
meta.mainProgram = "llama";
|
meta.mainProgram = "llama";
|
||||||
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
||||||
buildInputs = with pkgs; [ openmpi ];
|
buildInputs = with pkgs; [ openmpi ];
|
||||||
osSpecific = with pkgs; buildInputs ++
|
osSpecific = with pkgs; buildInputs ++ (
|
||||||
(
|
|
||||||
if isAarch64 && isDarwin then
|
if isAarch64 && isDarwin then
|
||||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||||
Accelerate
|
Accelerate
|
||||||
@ -96,12 +95,15 @@
|
|||||||
};
|
};
|
||||||
packages.rocm = pkgs.stdenv.mkDerivation {
|
packages.rocm = pkgs.stdenv.mkDerivation {
|
||||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||||
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
|
buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
|
||||||
cmakeFlags = cmakeFlags ++ [
|
cmakeFlags = cmakeFlags ++ [
|
||||||
"-DLLAMA_HIPBLAS=1"
|
"-DLLAMA_HIPBLAS=1"
|
||||||
"-DCMAKE_C_COMPILER=hipcc"
|
"-DCMAKE_C_COMPILER=hipcc"
|
||||||
"-DCMAKE_CXX_COMPILER=hipcc"
|
"-DCMAKE_CXX_COMPILER=hipcc"
|
||||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
|
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
|
||||||
|
# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
|
||||||
|
# and select the line that matches the current nixpkgs version of rocBLAS.
|
||||||
|
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
apps.llama-server = {
|
apps.llama-server = {
|
||||||
|
473
ggml-alloc.c
473
ggml-alloc.c
@ -1,51 +1,21 @@
|
|||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend-impl.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-impl.h"
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <limits.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
#define MAX_FREE_BLOCKS 256
|
||||||
|
|
||||||
//#define GGML_ALLOCATOR_DEBUG
|
//#define GGML_ALLOCATOR_DEBUG
|
||||||
|
|
||||||
//#define AT_PRINTF printf
|
//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
|
||||||
#define AT_PRINTF(...) ((void)0)
|
#define AT_PRINTF(...)
|
||||||
|
|
||||||
struct hash_node {
|
|
||||||
struct ggml_tensor * t;
|
|
||||||
int n_children;
|
|
||||||
int n_views;
|
|
||||||
};
|
|
||||||
|
|
||||||
static size_t hash(void * p) {
|
|
||||||
return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
|
|
||||||
size_t h = hash(t);
|
|
||||||
|
|
||||||
// linear probing
|
|
||||||
size_t i = h;
|
|
||||||
while (hash_table[i].t != NULL) {
|
|
||||||
if (hash_table[i].t == t) {
|
|
||||||
return &hash_table[i];
|
|
||||||
}
|
|
||||||
i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
|
|
||||||
if (i == h) {
|
|
||||||
// hash table is full
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
hash_table[i].t = t;
|
|
||||||
return &hash_table[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: GGML_PAD ?
|
// TODO: GGML_PAD ?
|
||||||
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
||||||
@ -59,20 +29,18 @@ struct free_block {
|
|||||||
size_t size;
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define MAX_FREE_BLOCKS 256
|
struct ggml_tallocr {
|
||||||
|
|
||||||
struct ggml_allocr {
|
|
||||||
struct ggml_backend_buffer * buffer;
|
struct ggml_backend_buffer * buffer;
|
||||||
bool buffer_owned;
|
bool buffer_owned;
|
||||||
void * data;
|
void * base;
|
||||||
size_t alignment;
|
size_t alignment;
|
||||||
|
|
||||||
int n_free_blocks;
|
int n_free_blocks;
|
||||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||||
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
|
||||||
size_t max_size;
|
size_t max_size;
|
||||||
|
|
||||||
bool measure;
|
bool measure;
|
||||||
int parse_seq[GGML_MAX_CONCUR];
|
|
||||||
int parse_seq_len;
|
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
struct ggml_tensor * allocated_tensors[1024];
|
struct ggml_tensor * allocated_tensors[1024];
|
||||||
@ -80,7 +48,7 @@ struct ggml_allocr {
|
|||||||
};
|
};
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
||||||
for (int i = 0; i < 1024; i++) {
|
for (int i = 0; i < 1024; i++) {
|
||||||
if (alloc->allocated_tensors[i] == NULL) {
|
if (alloc->allocated_tensors[i] == NULL) {
|
||||||
alloc->allocated_tensors[i] = tensor;
|
alloc->allocated_tensors[i] = tensor;
|
||||||
@ -89,7 +57,7 @@ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor
|
|||||||
}
|
}
|
||||||
GGML_ASSERT(!"out of allocated_tensors");
|
GGML_ASSERT(!"out of allocated_tensors");
|
||||||
}
|
}
|
||||||
static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
||||||
for (int i = 0; i < 1024; i++) {
|
for (int i = 0; i < 1024; i++) {
|
||||||
if (alloc->allocated_tensors[i] == tensor ||
|
if (alloc->allocated_tensors[i] == tensor ||
|
||||||
(alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
|
(alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
|
||||||
@ -103,7 +71,7 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// check if a tensor is allocated by this buffer
|
// check if a tensor is allocated by this buffer
|
||||||
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
|
||||||
return tensor->buffer == alloc->buffer;
|
return tensor->buffer == alloc->buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -111,7 +79,7 @@ static bool ggml_is_view(struct ggml_tensor * t) {
|
|||||||
return t->view_src != NULL;
|
return t->view_src != NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
||||||
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
||||||
|
|
||||||
@ -162,9 +130,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
|||||||
}
|
}
|
||||||
|
|
||||||
tensor->data = addr;
|
tensor->data = addr;
|
||||||
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
|
|
||||||
tensor->buffer = alloc->buffer;
|
tensor->buffer = alloc->buffer;
|
||||||
|
if (!alloc->measure) {
|
||||||
ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
|
ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
add_allocated_tensor(alloc, tensor);
|
add_allocated_tensor(alloc, tensor);
|
||||||
@ -180,16 +149,16 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
|
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
|
||||||
}
|
}
|
||||||
|
|
||||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||||
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
||||||
if (ggml_allocr_is_own(alloc, tensor) == false) {
|
if (ggml_tallocr_is_own(alloc, tensor) == false) {
|
||||||
// the tensor was not allocated in this buffer
|
// the tensor was not allocated in this buffer
|
||||||
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
||||||
// the easiest way to deal with this is just to ignore it
|
// the easiest way to deal with this is just to ignore it
|
||||||
AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
|
// AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -199,7 +168,9 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
|||||||
size = aligned_offset(NULL, size, alloc->alignment);
|
size = aligned_offset(NULL, size, alloc->alignment);
|
||||||
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
||||||
|
|
||||||
|
if (!alloc->measure) {
|
||||||
ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
|
ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
remove_allocated_tensor(alloc, tensor);
|
remove_allocated_tensor(alloc, tensor);
|
||||||
@ -253,91 +224,180 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
|||||||
alloc->n_free_blocks++;
|
alloc->n_free_blocks++;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
|
void ggml_tallocr_reset(ggml_tallocr_t alloc) {
|
||||||
for (int i = 0; i < n; i++) {
|
|
||||||
alloc->parse_seq[i] = list[i];
|
|
||||||
}
|
|
||||||
alloc->parse_seq_len = n;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
|
||||||
alloc->n_free_blocks = 1;
|
alloc->n_free_blocks = 1;
|
||||||
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
|
size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
|
||||||
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
|
alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
|
||||||
|
|
||||||
|
if (alloc->measure) {
|
||||||
|
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
||||||
|
} else {
|
||||||
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
|
ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
|
||||||
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
|
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
|
||||||
|
|
||||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
|
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
|
||||||
|
|
||||||
*alloc = (struct ggml_allocr){
|
*alloc = (struct ggml_tallocr) {
|
||||||
/*.buffer = */ buffer,
|
/*.buffer = */ buffer,
|
||||||
/*.buffer_owned = */ true,
|
/*.buffer_owned = */ true,
|
||||||
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
||||||
/*.alignment = */ alignment,
|
/*.alignment = */ alignment,
|
||||||
/*.n_free_blocks = */ 0,
|
/*.n_free_blocks = */ 0,
|
||||||
/*.free_blocks = */ {{0}},
|
/*.free_blocks = */ {{0}},
|
||||||
/*.hash_table = */ {{0}},
|
|
||||||
/*.max_size = */ 0,
|
/*.max_size = */ 0,
|
||||||
/*.measure = */ false,
|
/*.measure = */ false,
|
||||||
/*.parse_seq = */ {0},
|
|
||||||
/*.parse_seq_len = */ 0,
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
/*.allocated_tensors = */ {0},
|
/*.allocated_tensors = */ {0},
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_allocr_reset(alloc);
|
ggml_tallocr_reset(alloc);
|
||||||
|
|
||||||
return alloc;
|
return alloc;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
|
||||||
struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
|
ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
|
||||||
alloc->measure = true;
|
alloc->measure = true;
|
||||||
|
|
||||||
return alloc;
|
return alloc;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
|
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
|
||||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
|
// create a backend buffer to get the correct tensor allocation sizes
|
||||||
|
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
|
||||||
|
|
||||||
*alloc = (struct ggml_allocr){
|
// TODO: move alloc initialization to a common ggml_tallocr_new_impl function
|
||||||
|
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
|
||||||
|
alloc->buffer_owned = true;
|
||||||
|
alloc->measure = true;
|
||||||
|
ggml_tallocr_reset(alloc);
|
||||||
|
return alloc;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
|
||||||
|
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
|
||||||
|
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
|
||||||
|
alloc->buffer_owned = true;
|
||||||
|
return alloc;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
|
||||||
|
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
|
||||||
|
|
||||||
|
*alloc = (struct ggml_tallocr) {
|
||||||
/*.buffer = */ buffer,
|
/*.buffer = */ buffer,
|
||||||
/*.buffer_owned = */ false,
|
/*.buffer_owned = */ false,
|
||||||
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
||||||
/*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
|
/*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
|
||||||
/*.n_free_blocks = */ 0,
|
/*.n_free_blocks = */ 0,
|
||||||
/*.free_blocks = */ {{0}},
|
/*.free_blocks = */ {{0}},
|
||||||
/*.hash_table = */ {{0}},
|
|
||||||
/*.max_size = */ 0,
|
/*.max_size = */ 0,
|
||||||
/*.measure = */ false,
|
/*.measure = */ false,
|
||||||
/*.parse_seq = */ {0},
|
|
||||||
/*.parse_seq_len = */ 0,
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
/*.allocated_tensors = */ {0},
|
/*.allocated_tensors = */ {0},
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_allocr_reset(alloc);
|
ggml_tallocr_reset(alloc);
|
||||||
|
|
||||||
return alloc;
|
return alloc;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
|
||||||
|
return alloc->buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_tallocr_free(ggml_tallocr_t alloc) {
|
||||||
|
if (alloc == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (alloc->buffer_owned) {
|
if (alloc->buffer_owned) {
|
||||||
ggml_backend_buffer_free(alloc->buffer);
|
ggml_backend_buffer_free(alloc->buffer);
|
||||||
}
|
}
|
||||||
free(alloc);
|
free(alloc);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
|
||||||
return alloc->measure;
|
return alloc->measure;
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////// compute graph allocator
|
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
|
||||||
|
return alloc->max_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// graph allocator
|
||||||
|
|
||||||
|
struct hash_node {
|
||||||
|
int n_children;
|
||||||
|
int n_views;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_gallocr {
|
||||||
|
ggml_tallocr_t talloc;
|
||||||
|
struct ggml_hash_set hash_set;
|
||||||
|
struct hash_node * hash_values;
|
||||||
|
size_t hash_values_size;
|
||||||
|
ggml_tallocr_t * hash_allocs;
|
||||||
|
int * parse_seq;
|
||||||
|
int parse_seq_len;
|
||||||
|
};
|
||||||
|
|
||||||
|
ggml_gallocr_t ggml_gallocr_new(void) {
|
||||||
|
ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
|
||||||
|
|
||||||
|
*galloc = (struct ggml_gallocr) {
|
||||||
|
/*.talloc = */ NULL,
|
||||||
|
/*.hash_set = */ {0},
|
||||||
|
/*.hash_values = */ NULL,
|
||||||
|
/*.hash_values_size = */ 0,
|
||||||
|
/*.hash_allocs = */ NULL,
|
||||||
|
/*.parse_seq = */ NULL,
|
||||||
|
/*.parse_seq_len = */ 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
return galloc;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||||
|
if (galloc == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (galloc->hash_set.keys != NULL) {
|
||||||
|
free(galloc->hash_set.keys);
|
||||||
|
}
|
||||||
|
if (galloc->hash_values != NULL) {
|
||||||
|
free(galloc->hash_values);
|
||||||
|
}
|
||||||
|
if (galloc->hash_allocs != NULL) {
|
||||||
|
free(galloc->hash_allocs);
|
||||||
|
}
|
||||||
|
if (galloc->parse_seq != NULL) {
|
||||||
|
free(galloc->parse_seq);
|
||||||
|
}
|
||||||
|
free(galloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
|
||||||
|
free(galloc->parse_seq);
|
||||||
|
galloc->parse_seq = malloc(sizeof(int) * n);
|
||||||
|
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
galloc->parse_seq[i] = list[i];
|
||||||
|
}
|
||||||
|
galloc->parse_seq_len = n;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
||||||
|
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
|
||||||
|
return &galloc->hash_values[i];
|
||||||
|
}
|
||||||
|
|
||||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||||
if (a->type != b->type) {
|
if (a->type != b->type) {
|
||||||
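Aside: the rewrite above splits the old allocator into a tensor allocator (ggml_tallocr) that hands out ranges of a single backend buffer. A minimal sketch of using it directly; it assumes the tensors were created in a no_alloc ggml context so their data pointers are still NULL, and uses only functions introduced in this change:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void place_tensors(ggml_backend_buffer_t buffer, struct ggml_tensor ** tensors, int n_tensors) {
    // the tallocr does not take ownership of the buffer here (buffer_owned == false)
    ggml_tallocr_t talloc = ggml_tallocr_new_from_buffer(buffer);

    for (int i = 0; i < n_tensors; i++) {
        ggml_tallocr_alloc(talloc, tensors[i]); // sets data and buffer on the tensor
    }

    // high-water mark of everything placed so far
    const size_t used = ggml_tallocr_max_size(talloc);
    (void) used;

    ggml_tallocr_free(talloc); // frees the allocator only, not the buffer
}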
@ -378,23 +438,40 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
|
static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
|
||||||
assert(view->view_src != NULL && view->view_src->data != NULL);
|
if (galloc->talloc != NULL) {
|
||||||
|
return galloc->talloc;
|
||||||
|
}
|
||||||
|
|
||||||
|
return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
|
||||||
|
ggml_tallocr_t alloc = node_tallocr(galloc, view);
|
||||||
|
|
||||||
|
//printf("init_view: %s from src %s\n", view->name, view->view_src->name);
|
||||||
|
GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
|
||||||
|
if (update_backend) {
|
||||||
view->backend = view->view_src->backend;
|
view->backend = view->view_src->backend;
|
||||||
|
}
|
||||||
view->buffer = view->view_src->buffer;
|
view->buffer = view->view_src->buffer;
|
||||||
view->data = (char *)view->view_src->data + view->view_offs;
|
view->data = (char *)view->view_src->data + view->view_offs;
|
||||||
|
|
||||||
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
|
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
|
||||||
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
|
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
|
||||||
assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
|
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
|
||||||
|
|
||||||
|
if (!alloc->measure) {
|
||||||
ggml_backend_buffer_init_tensor(alloc->buffer, view);
|
ggml_backend_buffer_init_tensor(alloc->buffer, view);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
|
||||||
|
ggml_tallocr_t alloc = node_tallocr(galloc, node);
|
||||||
|
|
||||||
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
|
|
||||||
struct hash_node * ht = alloc->hash_table;
|
|
||||||
if (node->data == NULL) {
|
if (node->data == NULL) {
|
||||||
if (ggml_is_view(node)) {
|
if (ggml_is_view(node)) {
|
||||||
init_view(alloc, node);
|
init_view(galloc, node, true);
|
||||||
} else {
|
} else {
|
||||||
// see if we can reuse a parent's buffer (inplace)
|
// see if we can reuse a parent's buffer (inplace)
|
||||||
if (ggml_op_can_inplace(node->op)) {
|
if (ggml_op_can_inplace(node->op)) {
|
||||||
@ -405,16 +482,16 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// if the node's data is external, then we cannot re-use it
|
// if the node's data is external, then we cannot re-use it
|
||||||
if (ggml_allocr_is_own(alloc, parent) == false) {
|
if (ggml_tallocr_is_own(alloc, parent) == false) {
|
||||||
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
|
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct hash_node * p_hn = hash_get(ht, parent);
|
struct hash_node * p_hn = hash_get(galloc, parent);
|
||||||
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
|
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
|
||||||
if (ggml_is_view(parent)) {
|
if (ggml_is_view(parent)) {
|
||||||
struct ggml_tensor * view_src = parent->view_src;
|
struct ggml_tensor * view_src = parent->view_src;
|
||||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
struct hash_node * view_src_hn = hash_get(galloc, view_src);
|
||||||
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
||||||
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
||||||
// the parent's data that it will need later (same layout requirement). the problem is that then
|
// the parent's data that it will need later (same layout requirement). the problem is that then
|
||||||
@ -424,46 +501,44 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|||||||
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
||||||
node->view_src = view_src;
|
node->view_src = view_src;
|
||||||
view_src_hn->n_views += 1;
|
view_src_hn->n_views += 1;
|
||||||
init_view(alloc, node);
|
init_view(galloc, node, false);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||||
node->view_src = parent;
|
node->view_src = parent;
|
||||||
p_hn->n_views += 1;
|
p_hn->n_views += 1;
|
||||||
init_view(alloc, node);
|
init_view(galloc, node, false);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ggml_allocr_alloc(alloc, node);
|
ggml_tallocr_alloc(alloc, node);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_allocr_alloc_graph_n(
|
static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
|
||||||
struct ggml_allocr * alloc,
|
ggml_tallocr_t alloc = node_tallocr(galloc, node);
|
||||||
struct ggml_cgraph ** graphs, int n_graphs,
|
|
||||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
|
||||||
|
|
||||||
// reset hash table
|
ggml_tallocr_free_tensor(alloc, node);
|
||||||
struct hash_node * ht = alloc->hash_table;
|
}
|
||||||
memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
|
|
||||||
|
static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
|
||||||
|
const int * parse_seq = galloc->parse_seq;
|
||||||
|
int parse_seq_len = galloc->parse_seq_len;
|
||||||
|
|
||||||
// count number of children and views
|
// count number of children and views
|
||||||
for (int g = 0; g < n_graphs; g++) {
|
|
||||||
struct ggml_cgraph * gf = graphs[g];
|
|
||||||
for (int i = 0; i < gf->n_nodes; i++) {
|
for (int i = 0; i < gf->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = gf->nodes[i];
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
|
|
||||||
if (ggml_is_view(node)) {
|
if (ggml_is_view(node)) {
|
||||||
struct ggml_tensor * view_src = node->view_src;
|
struct ggml_tensor * view_src = node->view_src;
|
||||||
hash_get(ht, view_src)->n_views += 1;
|
hash_get(galloc, view_src)->n_views += 1;
|
||||||
if (node->buffer == NULL && node->data != NULL) {
|
if (node->buffer == NULL && node->data != NULL) {
|
||||||
// view of a pre-allocated tensor, didn't call init_view() yet
|
// view of a pre-allocated tensor, didn't call init_view() yet
|
||||||
init_view(alloc, node);
|
init_view(galloc, node, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -472,34 +547,22 @@ size_t ggml_allocr_alloc_graph_n(
|
|||||||
if (parent == NULL) {
|
if (parent == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
hash_get(ht, parent)->n_children += 1;
|
hash_get(galloc, parent)->n_children += 1;
|
||||||
if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
|
if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
|
||||||
init_view(alloc, parent);
|
init_view(galloc, parent, true);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// allocate tensors
|
// allocate tensors
|
||||||
for (int g = 0; g < n_graphs; g++) {
|
|
||||||
struct ggml_cgraph * gf = graphs[g];
|
|
||||||
AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
|
|
||||||
// graph inputs are allocated first to ensure that they are not overwritten by each other
|
|
||||||
if (inputs != NULL && inputs[g] != NULL) {
|
|
||||||
for (int i = 0; inputs[g][i] != NULL; i++) {
|
|
||||||
struct ggml_tensor * input = inputs[g][i];
|
|
||||||
AT_PRINTF("input: %s\n", input->name);
|
|
||||||
allocate_node(alloc, input);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
|
// if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
|
||||||
int last_barrier_pos = 0;
|
int last_barrier_pos = 0;
|
||||||
int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
|
int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
|
||||||
|
|
||||||
for (int ind = 0; ind < n_nodes; ind++) {
|
for (int ind = 0; ind < n_nodes; ind++) {
|
||||||
// allocate a node if there is no parse_seq or this is not a barrier
|
// allocate a node if there is no parse_seq or this is not a barrier
|
||||||
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
|
if (parse_seq_len == 0 || parse_seq[ind] != -1) {
|
||||||
int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
|
int i = parse_seq_len ? parse_seq[ind] : ind;
|
||||||
struct ggml_tensor * node = gf->nodes[i];
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
|
|
||||||
// allocate parents (leafs)
|
// allocate parents (leafs)
|
||||||
@ -508,11 +571,11 @@ size_t ggml_allocr_alloc_graph_n(
|
|||||||
if (parent == NULL) {
|
if (parent == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
allocate_node(alloc, parent);
|
allocate_node(galloc, parent);
|
||||||
}
|
}
|
||||||
|
|
||||||
// allocate node
|
// allocate node
|
||||||
allocate_node(alloc, node);
|
allocate_node(galloc, node);
|
||||||
|
|
||||||
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
@ -531,11 +594,11 @@ size_t ggml_allocr_alloc_graph_n(
|
|||||||
// update parents
|
// update parents
|
||||||
// update immediately if there is no parse_seq
|
// update immediately if there is no parse_seq
|
||||||
// update only at barriers if there is parse_seq
|
// update only at barriers if there is parse_seq
|
||||||
if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
|
if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
|
||||||
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
|
int update_start = parse_seq_len ? last_barrier_pos : ind;
|
||||||
int update_end = alloc->parse_seq_len ? ind : ind + 1;
|
int update_end = parse_seq_len ? ind : ind + 1;
|
||||||
for (int i = update_start; i < update_end; i++) {
|
for (int i = update_start; i < update_end; i++) {
|
||||||
int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
|
int node_i = parse_seq_len ? parse_seq[i] : i;
|
||||||
struct ggml_tensor * node = gf->nodes[node_i];
|
struct ggml_tensor * node = gf->nodes[node_i];
|
||||||
|
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
@ -543,7 +606,7 @@ size_t ggml_allocr_alloc_graph_n(
|
|||||||
if (parent == NULL) {
|
if (parent == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
struct hash_node * p_hn = hash_get(ht, parent);
|
struct hash_node * p_hn = hash_get(galloc, parent);
|
||||||
p_hn->n_children -= 1;
|
p_hn->n_children -= 1;
|
||||||
|
|
||||||
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
||||||
@ -551,44 +614,154 @@ size_t ggml_allocr_alloc_graph_n(
|
|||||||
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||||
if (ggml_is_view(parent)) {
|
if (ggml_is_view(parent)) {
|
||||||
struct ggml_tensor * view_src = parent->view_src;
|
struct ggml_tensor * view_src = parent->view_src;
|
||||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
struct hash_node * view_src_hn = hash_get(galloc, view_src);
|
||||||
view_src_hn->n_views -= 1;
|
view_src_hn->n_views -= 1;
|
||||||
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
||||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
|
||||||
ggml_allocr_free_tensor(alloc, view_src);
|
free_node(galloc, view_src);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (parent->data != node->data) {
|
free_node(galloc, parent);
|
||||||
ggml_allocr_free_tensor(alloc, parent);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
AT_PRINTF("\n");
|
AT_PRINTF("\n");
|
||||||
if (alloc->parse_seq_len) {
|
if (parse_seq_len) {
|
||||||
last_barrier_pos = ind + 1;
|
last_barrier_pos = ind + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// free graph outputs here that wouldn't be freed otherwise because they have no children
|
|
||||||
if (outputs != NULL && outputs[g] != NULL) {
|
|
||||||
for (int i = 0; outputs[g][i] != NULL; i++) {
|
|
||||||
struct ggml_tensor * output = outputs[g][i];
|
|
||||||
AT_PRINTF("output: %s\n", output->name);
|
|
||||||
ggml_allocr_free_tensor(alloc, output);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return alloc->max_size;
|
size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
|
||||||
|
size_t hash_size = graph->visited_hash_table.size;
|
||||||
|
|
||||||
|
// check if the hash table is initialized and large enough
|
||||||
|
if (galloc->hash_set.size < hash_size) {
|
||||||
|
if (galloc->hash_set.keys != NULL) {
|
||||||
|
free(galloc->hash_set.keys);
|
||||||
|
}
|
||||||
|
if (galloc->hash_values != NULL) {
|
||||||
|
free(galloc->hash_values);
|
||||||
|
}
|
||||||
|
galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
|
||||||
|
galloc->hash_set.size = hash_size;
|
||||||
|
galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
// reset hash table
|
||||||
return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
|
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
|
||||||
|
memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
|
||||||
|
|
||||||
|
galloc->talloc = talloc;
|
||||||
|
ggml_tallocr_alloc_graph_impl(galloc, graph);
|
||||||
|
galloc->talloc = NULL;
|
||||||
|
|
||||||
|
size_t max_size = ggml_tallocr_max_size(talloc);
|
||||||
|
|
||||||
|
return max_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
|
void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
|
||||||
return alloc->max_size;
|
const size_t hash_size = hash_set.size;
|
||||||
|
|
||||||
|
GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
|
||||||
|
|
||||||
|
galloc->talloc = NULL;
|
||||||
|
|
||||||
|
// alloc hash_values if needed
|
||||||
|
if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
|
||||||
|
free(galloc->hash_values);
|
||||||
|
galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
|
||||||
|
galloc->hash_values_size = hash_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// free hash_set.keys if needed
|
||||||
|
if (galloc->hash_set.keys != NULL) {
|
||||||
|
free(galloc->hash_set.keys);
|
||||||
|
}
|
||||||
|
galloc->hash_set = hash_set;
|
||||||
|
|
||||||
|
// reset hash values
|
||||||
|
memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
|
||||||
|
|
||||||
|
galloc->hash_allocs = hash_node_talloc;
|
||||||
|
|
||||||
|
ggml_tallocr_alloc_graph_impl(galloc, graph);
|
||||||
|
|
||||||
|
// remove unowned resources
|
||||||
|
galloc->hash_set.keys = NULL;
|
||||||
|
galloc->hash_allocs = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// legacy API wrapper
|
||||||
|
|
||||||
|
struct ggml_allocr {
|
||||||
|
ggml_tallocr_t talloc;
|
||||||
|
ggml_gallocr_t galloc;
|
||||||
|
};
|
||||||
|
|
||||||
|
static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
|
||||||
|
ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
|
||||||
|
*alloc = (struct ggml_allocr) {
|
||||||
|
/*.talloc = */ talloc,
|
||||||
|
/*.galloc = */ ggml_gallocr_new(),
|
||||||
|
};
|
||||||
|
return alloc;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
|
||||||
|
return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
|
||||||
|
return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
|
||||||
|
return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
|
||||||
|
return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
|
||||||
|
return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
|
||||||
|
return ggml_tallocr_get_buffer(alloc->talloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
|
||||||
|
ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_allocr_free(ggml_allocr_t alloc) {
|
||||||
|
ggml_gallocr_free(alloc->galloc);
|
||||||
|
ggml_tallocr_free(alloc->talloc);
|
||||||
|
free(alloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
|
||||||
|
return ggml_tallocr_is_measure(alloc->talloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_allocr_reset(ggml_allocr_t alloc) {
|
||||||
|
ggml_tallocr_reset(alloc->talloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
|
||||||
|
ggml_tallocr_alloc(alloc->talloc, tensor);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
|
||||||
|
return ggml_tallocr_max_size(alloc->talloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
|
||||||
|
return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
|
||||||
}
|
}
|
||||||
|
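Aside: with the split in place, graph allocation becomes "one ggml_gallocr that decides lifetimes and in-place reuse, one ggml_tallocr that owns the memory". A hedged sketch against the functions defined above; the backend handle and a pre-measured buffer size are assumed to come from the caller:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Returns the tensor allocator that owns the backend buffer; the caller must
// keep it alive while the graph's tensors are in use and release it with
// ggml_tallocr_free.
static ggml_tallocr_t alloc_graph_on_backend(ggml_backend_t backend, struct ggml_cgraph * graph, size_t buf_size) {
    ggml_gallocr_t galloc = ggml_gallocr_new();
    ggml_tallocr_t talloc = ggml_tallocr_new_from_backend(backend, buf_size);

    // walks the graph, reusing parents in-place where possible and assigning
    // ranges of the tallocr's buffer to every remaining tensor
    ggml_gallocr_alloc_graph(galloc, talloc, graph);

    ggml_gallocr_free(galloc);
    return talloc;
}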
80
ggml-alloc.h
80
ggml-alloc.h
@ -6,27 +6,79 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
struct ggml_backend;
|
||||||
struct ggml_backend_buffer;
|
struct ggml_backend_buffer;
|
||||||
|
|
||||||
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
|
//
|
||||||
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
// Legacy API
|
||||||
GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
//
|
||||||
|
|
||||||
|
typedef struct ggml_allocr * ggml_allocr_t;
|
||||||
|
|
||||||
|
// initialize allocator for use with CPU backend only
|
||||||
|
GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
|
||||||
|
GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
|
||||||
|
|
||||||
|
// initialize allocator for use with ggml-backend
|
||||||
|
GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
||||||
|
GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
|
||||||
|
GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
|
||||||
|
|
||||||
|
GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
|
||||||
|
|
||||||
// tell the allocator to parse nodes following the order described in the list
|
// tell the allocator to parse nodes following the order described in the list
|
||||||
// you should call this if your graphs are optimized to execute out-of-order
|
// you should call this if your graphs are optimized to execute out-of-order
|
||||||
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
|
GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
|
||||||
|
|
||||||
GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
|
GGML_API void ggml_allocr_free (ggml_allocr_t alloc);
|
||||||
GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
|
GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc);
|
||||||
GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
|
GGML_API void ggml_allocr_reset (ggml_allocr_t alloc);
|
||||||
GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
|
||||||
GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
|
|
||||||
|
|
||||||
GGML_API size_t ggml_allocr_alloc_graph_n(
|
GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
|
||||||
struct ggml_allocr * alloc,
|
|
||||||
struct ggml_cgraph ** graphs, int n_graphs,
|
//
|
||||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
|
// ggml-backend v2 API
|
||||||
|
//
|
||||||
|
|
||||||
|
// Separate tensor and graph allocator objects
|
||||||
|
// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
|
||||||
|
// The original API is kept as a wrapper around the new API
|
||||||
|
|
||||||
|
// Tensor allocator
|
||||||
|
typedef struct ggml_tallocr * ggml_tallocr_t;
|
||||||
|
|
||||||
|
GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
|
||||||
|
GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
|
||||||
|
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
||||||
|
GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
|
||||||
|
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
|
||||||
|
|
||||||
|
GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
|
||||||
|
|
||||||
|
GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc);
|
||||||
|
GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
|
||||||
|
GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
|
||||||
|
GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
|
||||||
|
GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
|
||||||
|
|
||||||
|
|
||||||
|
// Graph allocator
|
||||||
|
typedef struct ggml_gallocr * ggml_gallocr_t;
|
||||||
|
|
||||||
|
GGML_API ggml_gallocr_t ggml_gallocr_new(void);
|
||||||
|
GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
|
||||||
|
|
||||||
|
GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
|
||||||
|
GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
|
||||||
|
|
||||||
|
// Allocate tensors from the allocators given by the hash table
|
||||||
|
GGML_API void ggml_gallocr_alloc_graph_n(
|
||||||
|
ggml_gallocr_t galloc,
|
||||||
|
struct ggml_cgraph * graph,
|
||||||
|
struct ggml_hash_set hash_set,
|
||||||
|
ggml_tallocr_t * hash_node_talloc);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
87
ggml-backend-impl.h
Normal file
87
ggml-backend-impl.h
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
// ggml-backend internal header
|
||||||
|
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// Backend buffer
|
||||||
|
//
|
||||||
|
|
||||||
|
typedef void * ggml_backend_buffer_context_t;
|
||||||
|
|
||||||
|
struct ggml_backend_buffer_i {
|
||||||
|
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||||
|
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
|
||||||
|
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
|
||||||
|
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
|
||||||
|
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_backend_buffer {
|
||||||
|
struct ggml_backend_buffer_i iface;
|
||||||
|
|
||||||
|
ggml_backend_t backend;
|
||||||
|
ggml_backend_buffer_context_t context;
|
||||||
|
|
||||||
|
size_t size;
|
||||||
|
};
|
||||||
|
|
||||||
|
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
|
struct ggml_backend * backend,
|
||||||
|
struct ggml_backend_buffer_i iface,
|
||||||
|
ggml_backend_buffer_context_t context,
|
||||||
|
size_t size);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Backend
|
||||||
|
//
|
||||||
|
|
||||||
|
typedef void * ggml_backend_context_t;
|
||||||
|
|
||||||
|
struct ggml_backend_i {
|
||||||
|
const char * (*get_name)(ggml_backend_t backend);
|
||||||
|
|
||||||
|
void (*free)(ggml_backend_t backend);
|
||||||
|
|
||||||
|
// buffer allocation
|
||||||
|
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
|
||||||
|
|
||||||
|
// get buffer alignment
|
||||||
|
size_t (*get_alignment)(ggml_backend_t backend);
|
||||||
|
|
||||||
|
// tensor data access
|
||||||
|
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
|
||||||
|
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
|
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
|
void (*synchronize) (ggml_backend_t backend);
|
||||||
|
|
||||||
|
// (optional) copy tensor between different backends, allow for single-copy transfers
|
||||||
|
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
|
||||||
|
// compute graph with a plan
|
||||||
|
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
|
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
|
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
|
|
||||||
|
// compute graph without a plan
|
||||||
|
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
|
|
||||||
|
// check if the backend supports an operation
|
||||||
|
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_backend {
|
||||||
|
struct ggml_backend_i iface;
|
||||||
|
|
||||||
|
ggml_backend_context_t context;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
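Aside: the new internal header above defines the buffer and backend interfaces as plain structs of callbacks. A hedged sketch of a minimal host-memory buffer against that interface; the optional callbacks are left NULL, which the helpers in ggml-backend.c below treat as default behaviour, and the owning backend object is assumed to exist already:

#include "ggml-backend-impl.h"

#include <stdlib.h>

static void host_buffer_free(ggml_backend_buffer_t buffer) {
    free(buffer->context); // the context holds the raw allocation
}

static void * host_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->context;
}

static struct ggml_backend_buffer_i host_buffer_iface = {
    /* .free_buffer    = */ host_buffer_free,
    /* .get_base       = */ host_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL, // optional post-allocation hook
    /* .free_tensor    = */ NULL, // optional pre-free hook
};

static ggml_backend_buffer_t host_alloc_buffer(ggml_backend_t backend, size_t size) {
    void * data = malloc(size);
    return ggml_backend_buffer_init(backend, host_buffer_iface, data, size);
}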
591
ggml-backend.c
591
ggml-backend.c
@@ -1,7 +1,9 @@
-#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
+#include "ggml-impl.h"

 #include <assert.h>
+#include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -33,6 +35,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }

 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+    if (buffer == NULL) {
+        return;
+    }
+
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
@@ -43,15 +49,20 @@ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
     return ggml_backend_get_alignment(buffer->backend);
 }

-void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_base(buffer);
-}
-
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
     return buffer->size;
 }

+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    void * base = buffer->iface.get_base(buffer);
+
+    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
+
+    return base;
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // get_alloc_size is optional, defaults to ggml_nbytes
     if (buffer->iface.get_alloc_size) {
         return buffer->iface.get_alloc_size(buffer, tensor);
     }
@@ -59,12 +70,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
 }

 void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
     }
 }

 void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // free_tensor is optional
     if (buffer->iface.free_tensor) {
         buffer->iface.free_tensor(buffer, tensor);
     }
@@ -73,14 +86,21 @@ void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_t
 // backend

 ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
-    return tensor->buffer->backend;
+    return tensor->buffer ? tensor->buffer->backend : NULL;
 }

 const char * ggml_backend_name(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return "NULL";
+    }
     return backend->iface.get_name(backend);
 }

 void ggml_backend_free(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return;
+    }
+
     backend->iface.free(backend);
 }

@@ -101,13 +121,23 @@ void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * dat
 }

 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
-    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+    ggml_backend_t backend = ggml_get_backend(tensor);
+
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(backend != NULL && "tensor backend not set");
+
+    backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+    backend->iface.synchronize(backend);
 }

 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
-    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+    ggml_backend_t backend = ggml_get_backend(tensor);
+
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(backend != NULL && "tensor backend not set");
+
+    backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+    backend->iface.synchronize(backend);
 }

 void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -156,7 +186,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

-    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
+    // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));

     if (src == dst) {
         return;
@@ -234,6 +264,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backen
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+
+    GGML_ASSERT(data != NULL && "failed to allocate buffer");

     return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
 }

@@ -271,8 +303,7 @@ static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml
 }

 static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
-    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+    ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));

     UNUSED(backend);
 }
@@ -383,3 +414,537 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
     return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
 }
+
+// scheduler
+
+#define GGML_MAX_BACKENDS 4
+#define GGML_MAX_SPLITS 256
+#define GGML_MAX_SPLIT_INPUTS 16
+
+struct ggml_backend_sched_split {
+    ggml_tallocr_t tallocr;
+    int i_start;
+    int i_end;
+    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    int n_inputs;
+    struct ggml_cgraph * graph;
+};
+
+struct ggml_backend_sched {
+    int n_backends;
+    ggml_backend_t backends[GGML_MAX_BACKENDS];
+    ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
+
+    ggml_gallocr_t galloc;
+
+    struct ggml_hash_set hash_set;
+    ggml_tallocr_t * node_talloc;                                 // [hash_set.size]
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS];      // [hash_set.size][GGML_MAX_BACKENDS]
+
+    struct ggml_cgraph * graph;
+    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    int n_splits;
+
+    struct ggml_context * ctx;
+
+    // align context_buffer to GGML_MEM_ALIGN
+#ifdef _MSC_VER
+    __declspec(align(GGML_MEM_ALIGN))
+#else
+    __attribute__((aligned(GGML_MEM_ALIGN)))
+#endif
+    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + GGML_MAX_SPLITS*sizeof(struct ggml_cgraph)];
+};
+
+#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
+#define node_allocr(node) sched->node_talloc[hash_id(node)]
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}
+
+// returns the priority of the backend, lower is better
+static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (sched->backends[i] == backend) {
+            return i;
+        }
+    }
+    return INT_MAX;
+}
+
+static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (sched->tallocs[i] == allocr) {
+            return i;
+        }
+    }
+    return INT_MAX;
+}
+
+// returns the backend that should be used for the node based on the current locations
+char causes[GGML_DEFAULT_GRAPH_SIZE*4 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
+static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
+    // ie. kv cache updates
+    // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
+    // dst
+    ggml_backend_t cur_backend = ggml_get_backend(node);
+    if (cur_backend != NULL) {
+        sprintf(causes[hash_id(node)], "1.dst");
+        return cur_backend;
+    }
+
+    // view_src
+    if (node->view_src != NULL && ggml_get_backend(node->view_src) != NULL) {
+        sprintf(causes[hash_id(node)], "1.vsrc");
+        return ggml_get_backend(node->view_src);
+    }
+
+    // src
+    int cur_prio = INT_MAX;
+    size_t cur_size = 0;
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        const struct ggml_tensor * src = node->src[i];
+        if (src == NULL) {
+            break;
+        }
+        ggml_backend_t src_backend = ggml_get_backend(src);
+        if (src_backend != NULL) {
+            int src_prio = sched_backend_prio(sched, src_backend);
+            size_t src_size = ggml_nbytes(src);
+            if (src_prio < cur_prio && src_size >= cur_size) {
+                cur_prio = src_prio;
+                cur_size = src_size;
+                cur_backend = src_backend;
+                sprintf(causes[hash_id(node)], "1.src%d", i);
+            }
+        }
+    }
+    return cur_backend;
+}
+
+static char * fmt_size(size_t size) {
+    static char buffer[128];
+    if (size >= 1024*1024) {
+        sprintf(buffer, "%zuM", size/1024/1024);
+    } else {
+        sprintf(buffer, "%zuK", size/1024);
+    }
+    return buffer;
+}
+
+static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    int cur_split = 0;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
+            ggml_backend_t split_backend = ggml_tallocr_get_buffer(sched->splits[cur_split].tallocr)->backend;
+            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs);
+            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
+            }
+            fprintf(stderr, "\n");
+            cur_split++;
+        }
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        ggml_backend_t node_backend = node_allocr ? ggml_tallocr_get_buffer(node_allocr)->backend : NULL;
+        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", causes[hash_id(node)]);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            ggml_backend_t src_backend = src_allocr ? ggml_tallocr_get_buffer(src_allocr)->backend : NULL;
+            fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", causes[hash_id(src)]);
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
+// TODO: merge passes
+static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    // reset state
+    size_t hash_size = sched->hash_set.size;
+    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
+    memset(sched->node_talloc,   0, sizeof(sched->node_talloc[0])   * hash_size);
+    memset(sched->node_copies,   0, sizeof(sched->node_copies[0])   * hash_size);
+    sched->n_splits = 0;
+
+    struct ggml_init_params params = {
+        /*.mem_size =   */ sizeof(sched->context_buffer),
+        /*.mem_buffer = */ sched->context_buffer,
+        /*.no_alloc =   */ true
+    };
+
+    if (sched->ctx != NULL) {
+        ggml_free(sched->ctx);
+    }
+
+    sched->ctx = ggml_init(params);
+
+    // pass 1: assign backends to ops with allocated inputs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        if (node_allocr(leaf) != NULL) {
+            // do not overwrite user assignments
+            continue;
+        }
+        ggml_backend_t leaf_backend = ggml_get_backend(leaf);
+        if (leaf_backend == NULL && leaf->view_src != NULL) {
+            leaf_backend = ggml_get_backend(leaf->view_src);
+        }
+        if (leaf_backend != NULL) {
+            node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
+        }
+    }
+
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (node_allocr(node) != NULL) {
+            // do not overwrite user assignments
+            continue;
+        }
+        ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
+        if (node_backend != NULL) {
+            node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
+        }
+    }
+    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 2: assign backends to ops from current assignments
+    // TODO:
+    //  - reuse sched_backend_from_cur
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        if (node_allocr == NULL) {
+            int cur_prio = INT_MAX;
+            size_t cur_size = 0;
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    break;
+                }
+                ggml_tallocr_t src_allocr = node_allocr(src);
+                if (src_allocr != NULL) {
+                    int src_prio = sched_allocr_prio(sched, src_allocr);
+                    size_t src_size = ggml_nbytes(src);
+                    if (src_prio < cur_prio && src_size >= cur_size) {
+                        cur_prio = src_prio;
+                        cur_size = src_size;
+                        node_allocr = src_allocr;
+                        sprintf(causes[hash_id(node)], "2.src%d", j);
+                    }
+                }
+            }
+            if (node_allocr != NULL) {
+                node_allocr(node) = node_allocr;
+            }
+        }
+    }
+    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 3: assign backends to remaining src from dst (should only be leafs)
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr == NULL) {
+                node_allocr(src) = node_allocr;
+            }
+        }
+    }
+    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 4: split graph, find tensors that need to be copied
+    // TODO:
+    //  - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
+    // find first backend
+    int cur_split = 0;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (node->view_src == NULL) {
+            sched->splits[0].tallocr = node_allocr(node);
+            break;
+        }
+    }
+    sched->splits[0].i_start = 0;
+    sched->splits[0].n_inputs = 0;
+    memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
+    ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
+    size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+
+        ggml_tallocr_t node_allocr = node_allocr(node);
+
+        if (node_allocr != cur_allocr) {
+            sched->splits[cur_split].i_end = i;
+            cur_split++;
+            GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+            sched->splits[cur_split].tallocr = node_allocr;
+            sched->splits[cur_split].i_start = i;
+            sched->splits[cur_split].n_inputs = 0;
+            memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
+            cur_allocr = node_allocr;
+            cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+        }
+
+        // find inputs that are not on the same backend
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr != node_allocr) {
+                int n_inputs = sched->splits[cur_split].n_inputs++;
+                GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
+
+                // create copies
+                size_t id = hash_id(src);
+                if (sched->node_copies[id][cur_backend_id] == NULL) {
+                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                    sched->node_copies[id][cur_backend_id] = tensor_copy;
+                    node_allocr(tensor_copy) = cur_allocr;
+                    ggml_backend_t backend = ggml_tallocr_get_buffer(cur_allocr)->backend;
+                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+                }
+                node->src[j] = sched->node_copies[id][cur_backend_id];
+            }
+        }
+    }
+    sched->splits[cur_split].i_end = graph->n_nodes;
+    sched->n_splits = cur_split + 1;
+
+    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
+
+#if 1
+    // sanity check: all sources should have the same backend as the node
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        if (node_allocr == NULL) {
+            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
+        }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
+                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
+                    node->name, node_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(node_allocr)->backend) : "NULL",
+                    j, src->name, src_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(src_allocr)->backend) : "NULL");
+            }
+        }
+    }
+#endif
+
+    // create copies of the graph for each split
+    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &sched->splits[i];
+        split->graph = ggml_graph_view(sched->ctx, graph, split->i_start, split->i_end);
+
+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
+        for (int j = 0; j < split->n_inputs; j++) {
+            struct ggml_tensor * input = split->inputs[j];
+            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            input_cpy->src[0] = input;
+            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
+        }
+
+        for (int j = split->i_start; j < split->i_end; j++) {
+            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
+        }
+    }
+    sched->graph = graph_copy;
+}
+
+static void sched_alloc_splits(ggml_backend_sched_t sched) {
+    ggml_gallocr_alloc_graph_n(
+        sched->galloc,
+        sched->graph,
+        sched->hash_set,
+        sched->node_talloc);
+}
+
+static void sched_compute_splits(ggml_backend_sched_t sched) {
+    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
+    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
+
+    struct ggml_backend_sched_split * splits = sched->splits;
+
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &splits[i];
+        ggml_backend_t split_backend = ggml_tallocr_get_buffer(split->tallocr)->backend;
+        int split_backend_id = sched_backend_prio(sched, split_backend);
+
+        // copy the input tensors to the split backend
+        uint64_t copy_start_us = ggml_time_us();
+        for (int j = 0; j < split->n_inputs; j++) {
+            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(split->inputs[j])][sched_backend_prio(sched, split_backend)];
+            if (split->inputs[j]->buffer == NULL) {
+                if (split->inputs[j]->view_src == NULL) {
+                    fprintf(stderr, "input %s has no buffer and no view_src\n", split->inputs[j]->name);
+                    exit(1);
+                }
+                struct ggml_tensor * view = split->inputs[j];
+                view->backend = view->view_src->backend;
+                view->buffer  = view->view_src->buffer;
+                view->data    = (char *)view->view_src->data + view->view_offs;
+                ggml_backend_buffer_init_tensor(ggml_backend_sched_get_buffer(sched, view->buffer->backend), view);
+            }
+            if (input_cpy->buffer == NULL) {
+                fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
+                exit(1);
+            }
+            GGML_ASSERT(split->inputs[j]->buffer->backend != input_cpy->buffer->backend);
+            GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            ggml_backend_tensor_copy(split->inputs[j], input_cpy);
+        }
+        // ggml_backend_synchronize(split_backend);
+        int64_t copy_end_us = ggml_time_us();
+        copy_us[split_backend_id] += copy_end_us - copy_start_us;
+
+#if 0
+        char split_filename[GGML_MAX_NAME];
+        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
+        ggml_graph_dump_dot(split->graph, NULL, split_filename);
+#endif
+
+        uint64_t compute_start_us = ggml_time_us();
+        ggml_backend_graph_compute(split_backend, split->graph);
+        // ggml_backend_synchronize(split_backend);
+        uint64_t compute_end_us = ggml_time_us();
+        compute_us[split_backend_id] += compute_end_us - compute_start_us;
+    }
+
+#if 0
+    // per-backend timings
+    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (copy_us[i] > 0 || compute_us[i] > 0) {
+            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        }
+    }
+#endif
+}
+
+static void sched_reset(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_tallocr_reset(sched->tallocs[i]);
+    }
+}
+
+ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
+    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+
+    struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
+    memset(sched, 0, sizeof(struct ggml_backend_sched));
+
+    fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
+
+    sched->n_backends = n_backends;
+    for (int i = 0; i < n_backends; i++) {
+        sched->backends[i] = backends[i];
+    }
+
+    sched->galloc = ggml_gallocr_new();
+
+    // init measure allocs for each backend
+    for (int i = 0; i < n_backends; i++) {
+        sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
+    }
+
+    return sched;
+}
+
+void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+    if (sched == NULL) {
+        return;
+    }
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_tallocr_free(sched->tallocs[i]);
+    }
+    ggml_gallocr_free(sched->galloc);
+    free(sched->hash_set.keys);
+    free(sched->node_talloc);
+    free(sched->node_copies);
+    free(sched);
+}
+
+void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    // initialize hash tables
+    size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
+    sched->hash_set.size = hash_size;
+    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
+    sched->node_talloc   = malloc(sizeof(sched->node_talloc[0])   * hash_size);
+    sched->node_copies   = malloc(sizeof(sched->node_copies[0])   * hash_size);
+
+    sched_split_graph(sched, measure_graph);
+    sched_alloc_splits(sched);
+
+    // allocate buffers and reset allocators
+    for (int i = 0; i < sched->n_backends; i++) {
+        size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
+        ggml_tallocr_free(sched->tallocs[i]);
+        sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
+    }
+
+    sched_reset(sched);
+}
+
+void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+
+    sched_split_graph(sched, graph);
+    sched_alloc_splits(sched);
+    sched_compute_splits(sched);
+    sched_reset(sched);
+}
+
+ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    return sched->tallocs[backend_index];
+}
+
+ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
+}
+
+void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+    node_allocr(node) = sched->tallocs[backend_index];
+}
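The synchronous helpers patched above (ggml_backend_tensor_set / ggml_backend_tensor_get) wrap the backend's asynchronous interface and then synchronize. A short, hypothetical usage sketch follows; it assumes a tensor that has already been allocated in a backend buffer and is not code from this commit.

// Hypothetical sketch: upload data into an allocated F32 tensor and read it back.
#include "ggml.h"
#include "ggml-backend.h"

#include <stdlib.h>

static void roundtrip_tensor(struct ggml_tensor * t) {
    const size_t nbytes = ggml_nbytes(t);

    float * host_in  = malloc(nbytes);
    float * host_out = malloc(nbytes);
    for (size_t i = 0; i < nbytes/sizeof(float); i++) {
        host_in[i] = (float) i;
    }

    // host -> backend buffer; returns after the async copy has been synchronized
    ggml_backend_tensor_set(t, host_in, 0, nbytes);

    // backend buffer -> host
    ggml_backend_tensor_get(t, host_out, 0, nbytes);

    free(host_in);
    free(host_out);
}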
151 ggml-backend.h
@@ -1,51 +1,20 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-alloc.h"

 #ifdef __cplusplus
 extern "C" {
 #endif
-    struct ggml_backend;
+
+    //
+    // Backend buffer
+    //
+
     struct ggml_backend_buffer;
+
-    // type-erased backend-specific types / wrappers
-    typedef void * ggml_backend_context_t;
-    typedef void * ggml_backend_graph_plan_t;
-    typedef void * ggml_backend_buffer_context_t;
-
-    // avoid accessing internals of these types
-    typedef struct ggml_backend * ggml_backend_t;
     typedef struct ggml_backend_buffer * ggml_backend_buffer_t;

-    //
-    // backend buffer
-    //
-
-    struct ggml_backend_buffer_i {
-        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i iface;
-
-        ggml_backend_t                backend;
-        ggml_backend_buffer_context_t context;
-
-        size_t size;
-    };
-
     // backend buffer functions
-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-            struct ggml_backend           * backend,
-            struct ggml_backend_buffer_i    iface,
-            ggml_backend_buffer_context_t   context,
-            size_t                          size);
-
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
@@ -55,50 +24,13 @@ extern "C" {
     GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

     //
-    // backend
+    // Backend
     //

-    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+    struct ggml_backend;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+
-        void (*free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
-
-        // get buffer alignment
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        void (*synchronize)     (ggml_backend_t backend);
-
-        // (optional) copy tensor between different backends, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend {
-        struct ggml_backend_i iface;
-
-        ggml_backend_context_t context;
-    };
-
-    // backend helper functions
     GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

     GGML_API const char * ggml_backend_name(ggml_backend_t backend);
@@ -133,11 +65,72 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);

     GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);

    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

+    // Create a backend buffer from an existing pointer
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);

+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backends to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+
+        // initialize buffers from a measure graph
+        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+
+        // in build_graph:
+        build_graph(...) {
+            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
+            ggml_allocr_alloc(alloc_cpu, tensor);
+
+            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
+        }
+
+        // allocate backend buffers from measure graph
+        ggml_backend_sched_init_measure(sched, measure_graph);
+
+        // the scheduler is now ready to compute graphs
+
+        // compute
+        graph = build_graph(sched);
+        ggml_backend_sched_graph_compute(sched, graph);
+    */
+
+    struct ggml_backend_sched;
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Initialize a backend scheduler
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
+
+    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
+    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+
+    // Allocate a graph on the backend scheduler
+    GGML_API void ggml_backend_sched_graph_compute(
+            ggml_backend_sched_t sched,
+            struct ggml_cgraph * graph);

 #ifdef __cplusplus
 }
 #endif
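The comment block in the header above sketches the intended flow in pseudocode. The same flow as a hedged, compilable-looking C fragment is shown here; build_graph() is assumed to be provided by the application and to allocate its inputs through the scheduler's allocators, as the comment describes. This is an illustration, not code from this commit.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Assumed to be supplied by the application, as in the example-usage comment.
extern struct ggml_cgraph * build_graph(ggml_backend_sched_t sched);

static void run_with_sched(ggml_backend_t backend_gpu, ggml_backend_t backend_cpu) {
    // array order determines priority: earlier backends are preferred
    ggml_backend_t backends[2] = { backend_gpu, backend_cpu };

    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, 2);

    // reserve backend buffers from a worst-case measure graph
    struct ggml_cgraph * measure_graph = build_graph(sched);
    ggml_backend_sched_init_measure(sched, measure_graph);

    // compute as many graphs as needed
    struct ggml_cgraph * graph = build_graph(sched);
    ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
}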
645 ggml-cuda.cu: file diff suppressed because it is too large
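The ggml-cuda header hunk that follows adds ggml_cublas_loaded() next to ggml_init_cublas(). A hedged sketch of how a caller might use the pair is below; the fallback logic is illustrative and not taken from the tree.

#include <stdbool.h>
#include <stdio.h>

#include "ggml-cuda.h"

// ggml_init_cublas() itself always "succeeds"; the query tells us whether a
// usable CUDA device and cuBLAS were actually found.
static bool try_enable_gpu_offload(void) {
    ggml_init_cublas();

    if (!ggml_cublas_loaded()) {
        fprintf(stderr, "no usable CUDA device, falling back to CPU\n");
        return false;
    }
    return true;
}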
@@ -17,7 +17,12 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES 16

+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void   ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_cublas_loaded(void);
+
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);

20 ggml-impl.h
@@ -39,12 +39,6 @@ extern "C" {
 #endif
 #endif

-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
@@ -230,7 +224,19 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {

 #endif

-// TODO: backend v2 PR
+#define GGML_HASHTABLE_FULL ((size_t)-1)
+#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+
+bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
+size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// return index, asserts if table is full
+size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);

 #ifdef __cplusplus
 }
@@ -26,7 +26,7 @@
 #include <stdbool.h>

 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_BUFFERS 64
 #define GGML_METAL_MAX_COMMAND_BUFFERS 32

 struct ggml_tensor;
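The ggml-impl.h hunk above promotes the internal hash-set helpers that the new scheduler relies on. A hypothetical sketch of the calling pattern follows; the ggml_hash_set fields (size, keys) are the ones used by the scheduler code earlier in this commit, and the helper itself is invented for illustration.

#include <stdio.h>
#include <stdlib.h>

#include "ggml.h"
#include "ggml-impl.h"

// Assumed: `tensors` is an array of `n` graph tensors we want to deduplicate.
static void count_unique_tensors(struct ggml_tensor ** tensors, int n) {
    struct ggml_hash_set set = {
        /* .size = */ (size_t) (n*2 + 1),
        /* .keys = */ calloc(n*2 + 1, sizeof(struct ggml_tensor *)),
    };

    int unique = 0;
    for (int i = 0; i < n; i++) {
        if (!ggml_hash_contains(set, tensors[i])) {
            ggml_hash_insert(set, tensors[i]); // asserts if the table is full
            unique++;
        }
    }

    printf("%d unique tensors\n", unique);
    free(set.keys);
}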
163 ggml-metal.m
@@ -1,5 +1,6 @@
 #import "ggml-metal.h"

+#import "ggml-backend-impl.h"
 #import "ggml.h"

 #import <Foundation/Foundation.h>
@@ -23,7 +24,7 @@

 #define UNUSED(x) (void)(x)

-#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+#define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE)

 struct ggml_metal_buffer {
     const char * name;
@@ -85,6 +86,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f16);
     GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
     GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
@@ -113,6 +115,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(rope_f32);
     GGML_METAL_DECL_KERNEL(rope_f16);
     GGML_METAL_DECL_KERNEL(alibi_f32);
+    GGML_METAL_DECL_KERNEL(im2col_f16);
     GGML_METAL_DECL_KERNEL(cpy_f32_f16);
     GGML_METAL_DECL_KERNEL(cpy_f32_f32);
     GGML_METAL_DECL_KERNEL(cpy_f16_f16);
@@ -125,7 +128,7 @@ struct ggml_metal_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-static NSString * const msl_library_source = @"see metal.metal";
+//static NSString * const msl_library_source = @"see metal.metal";

 // Here to assist with NSBundle Path Hack
 @interface GGMLMetalClass : NSObject
@@ -141,6 +144,7 @@ void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_dat
     ggml_metal_log_user_data = user_data;
 }

+GGML_ATTRIBUTE_FORMAT(2, 3)
 static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     if (ggml_metal_log_callback != NULL) {
         va_list args;
@@ -209,7 +213,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         } else {
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

-            NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+            NSString * sourcePath;
+            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+            if (ggmlMetalPathResources) {
+                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
+            } else {
+                sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+            }
             if (sourcePath == nil) {
                 GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
                 sourcePath = @"ggml-metal.metal";
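The hunk above lets the Metal shader source path be overridden through the GGML_METAL_PATH_RESOURCES environment variable. A hedged sketch of setting it from the host process before the Metal context is created is shown below; the directory path is a placeholder, not a path from this repository.

#include <stdlib.h>

// Must run before ggml_metal_init(), e.g. at the top of main().
// "/path/to/resources" stands in for a directory containing ggml-metal.metal.
static void use_local_metal_shaders(void) {
    setenv("GGML_METAL_PATH_RESOURCES", "/path/to/resources", /*overwrite=*/1);
}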
@@ -238,12 +248,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // load kernels
     {
         NSError * error = nil;
-#define GGML_METAL_ADD_KERNEL(name) \
-        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+
+        /*
         GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                 (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                 (int) ctx->pipeline_##name.threadExecutionWidth); \
+        */
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
         if (error) { \
             GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             return NULL; \
@@ -277,6 +290,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_f16_f16);
         GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
         GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
@@ -307,6 +321,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(rope_f32);
         GGML_METAL_ADD_KERNEL(rope_f16);
         GGML_METAL_ADD_KERNEL(alibi_f32);
+        GGML_METAL_ADD_KERNEL(im2col_f16);
         GGML_METAL_ADD_KERNEL(cpy_f32_f16);
         GGML_METAL_ADD_KERNEL(cpy_f32_f32);
         GGML_METAL_ADD_KERNEL(cpy_f16_f16);
@@ -325,15 +340,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
     for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
         if ([ctx->device supportsFamily:i]) {
-            GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
+            GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
             break;
         }
     }

     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MiB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MiB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
     }
@@ -376,6 +391,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
     GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_f16_f16);
     GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
     GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
     GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
@@ -406,6 +422,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(rope_f32);
     GGML_METAL_DEL_KERNEL(rope_f16);
     GGML_METAL_DEL_KERNEL(alibi_f32);
+    GGML_METAL_DEL_KERNEL(im2col_f16);
     GGML_METAL_DEL_KERNEL(cpy_f32_f16);
     GGML_METAL_DEL_KERNEL(cpy_f32_f32);
     GGML_METAL_DEL_KERNEL(cpy_f16_f16);
@@ -463,6 +480,10 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru

     const int64_t tsize = ggml_nbytes(t);

+    if (t->buffer && t->buffer->backend && t->buffer->backend->context) {
+        ctx = t->buffer->backend->context;
+    }
+
     // find the view that contains the tensor fully
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
@@ -520,11 +541,11 @@ bool ggml_metal_add_buffer(
         ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

         if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
             return false;
         }

-        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);

         ++ctx->n_buffers;
     } else {
@@ -544,11 +565,11 @@ bool ggml_metal_add_buffer(
             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                 return false;
             }

-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
             if (i + size_step < size) {
                 GGML_METAL_LOG_INFO("\n");
             }
@@ -563,7 +584,7 @@ bool ggml_metal_add_buffer(
                 ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

         if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
-            GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__);
+            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
         } else {
             GGML_METAL_LOG_INFO("\n");
         }
@@ -741,6 +762,20 @@ void ggml_metal_graph_compute(
         struct ggml_tensor * src1 = gf->nodes[i]->src[1];
         struct ggml_tensor * dst  = gf->nodes[i];

+        switch (dst->op) {
+            case GGML_OP_NONE:
+            case GGML_OP_RESHAPE:
+            case GGML_OP_VIEW:
+            case GGML_OP_TRANSPOSE:
+            case GGML_OP_PERMUTE:
+                {
+                    // noop -> next node
+                } continue;
+            default:
+                {
+                } break;
+        }
+
         const int64_t ne00 = src0 ? src0->ne[0] : 0;
         const int64_t ne01 = src0 ? src0->ne[1] : 0;
         const int64_t ne02 = src0 ? src0->ne[2] : 0;
@@ -794,14 +829,6 @@ void ggml_metal_graph_compute(
                 //}

                 switch (dst->op) {
-                    case GGML_OP_NONE:
-                    case GGML_OP_RESHAPE:
-                    case GGML_OP_VIEW:
-                    case GGML_OP_TRANSPOSE:
-                    case GGML_OP_PERMUTE:
-                        {
-                            // noop
-                        } break;
                     case GGML_OP_CONCAT:
                         {
                             const int64_t nb = ne00;
@@ -998,11 +1025,15 @@ void ggml_metal_graph_compute(
                     } break;
                 case GGML_OP_SOFT_MAX:
                     {
-                        const int nth = MIN(32, ne00);
+                        int nth = 32; // SIMD width
+
                         if (ne00%4 == 0) {
                             [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
                         } else {
+                            do {
+                                nth *= 2;
||||||
|
} while (nth <= ne00 && nth <= 1024);
|
||||||
|
nth /= 2;
|
||||||
[encoder setComputePipelineState:ctx->pipeline_soft_max];
|
[encoder setComputePipelineState:ctx->pipeline_soft_max];
|
||||||
}
|
}
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
@ -1010,8 +1041,9 @@ void ggml_metal_graph_compute(
|
|||||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||||
|
[encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_DIAG_MASK_INF:
|
case GGML_OP_DIAG_MASK_INF:
|
||||||
{
|
{
|
||||||
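Note on the SOFT_MAX dispatch change above: the grid is now a single flat dimension of ne01*ne02*ne03 threadgroups, and the kernel recovers its (i01, i02, i03) row coordinates from the linear threadgroup id. A minimal C sketch of that index math, with names mirroring the Metal code; the helper itself is illustrative and not part of the patch:

#include <stdint.h>

// recover 3-D row coordinates from a flattened threadgroup index
static void unflatten_row_index(int64_t tgpig, int64_t ne01, int64_t ne02,
                                int64_t * i01, int64_t * i02, int64_t * i03) {
    *i03 = tgpig / (ne02*ne01);
    *i02 = (tgpig - *i03*ne02*ne01) / ne01;
    *i01 = (tgpig - *i03*ne02*ne01 - *i02*ne01);
}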
@@ -1118,6 +1150,7 @@ void ggml_metal_graph_compute(
                switch (src0t) {
                    case GGML_TYPE_F32:
                        {
+                            GGML_ASSERT(src1t == GGML_TYPE_F32);
                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
                            nrows = 4;
                        } break;
@@ -1125,6 +1158,7 @@ void ggml_metal_graph_compute(
                        {
                            nth0 = 32;
                            nth1 = 1;
+                            if (src1t == GGML_TYPE_F32) {
                            if (ne11 * ne12 < 4) {
                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
@@ -1134,6 +1168,10 @@ void ggml_metal_graph_compute(
                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
                                nrows = 4;
                            }
+                            } else {
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16];
+                                nrows = 4;
+                            }
                        } break;
                    case GGML_TYPE_Q4_0:
                        {
@@ -1321,7 +1359,7 @@ void ggml_metal_graph_compute(
                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                        [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                        [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
+                        [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
 
                        const int64_t nrows = ggml_nrows(src0);
 
@@ -1340,7 +1378,7 @@ void ggml_metal_graph_compute(
                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                        [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                        [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                        [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
 
                        const int64_t nrows = ggml_nrows(src0);
 
@@ -1395,11 +1433,15 @@ void ggml_metal_graph_compute(
                        const int n_past     = ((int32_t *) dst->op_params)[0];
                        const int n_dims     = ((int32_t *) dst->op_params)[1];
                        const int mode       = ((int32_t *) dst->op_params)[2];
+                        const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
 
-                        float freq_base;
-                        float freq_scale;
-                        memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-                        memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+                        memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+                        memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+                        memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
                        switch (src0->type) {
                            case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
@@ -1429,11 +1471,68 @@ void ggml_metal_graph_compute(
                        [encoder setBytes:&n_past     length:sizeof(    int) atIndex:19];
                        [encoder setBytes:&n_dims     length:sizeof(    int) atIndex:20];
                        [encoder setBytes:&mode       length:sizeof(    int) atIndex:21];
-                        [encoder setBytes:&freq_base  length:sizeof(float) atIndex:22];
-                        [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
+                        [encoder setBytes:&n_orig_ctx  length:sizeof(    int) atIndex:22];
+                        [encoder setBytes:&freq_base   length:sizeof(  float) atIndex:23];
+                        [encoder setBytes:&freq_scale  length:sizeof(  float) atIndex:24];
+                        [encoder setBytes:&ext_factor  length:sizeof(  float) atIndex:25];
+                        [encoder setBytes:&attn_factor length:sizeof(  float) atIndex:26];
+                        [encoder setBytes:&beta_fast   length:sizeof(  float) atIndex:27];
+                        [encoder setBytes:&beta_slow   length:sizeof(  float) atIndex:28];
 
                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                    } break;
+                    case GGML_OP_IM2COL:
+                        {
+                            GGML_ASSERT(src0->type == GGML_TYPE_F16);
+                            GGML_ASSERT(src1->type == GGML_TYPE_F32);
+                            GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+                            const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+                            const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+                            const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+                            const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+                            const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+                            const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+                            const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+                            const int32_t N  = src1->ne[is_2D ? 3 : 2];
+                            const int32_t IC = src1->ne[is_2D ? 2 : 1];
+                            const int32_t IH = is_2D ? src1->ne[1] : 1;
+                            const int32_t IW =         src1->ne[0];
+
+                            const int32_t KH = is_2D ? src0->ne[1] : 1;
+                            const int32_t KW =         src0->ne[0];
+
+                            const int32_t OH = is_2D ? dst->ne[2] : 1;
+                            const int32_t OW =         dst->ne[1];
+
+                            const int32_t CHW = IC * KH * KW;
+
+                            const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
+                            const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
+
+                            switch (src0->type) {
+                                case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
+                                case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break;
+                                default: GGML_ASSERT(false);
+                            };
+
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2];
+                            [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3];
+                            [encoder setBytes:&IW   length:sizeof( int32_t) atIndex:4];
+                            [encoder setBytes:&IH   length:sizeof( int32_t) atIndex:5];
+                            [encoder setBytes:&CHW  length:sizeof( int32_t) atIndex:6];
+                            [encoder setBytes:&s0   length:sizeof( int32_t) atIndex:7];
+                            [encoder setBytes:&s1   length:sizeof( int32_t) atIndex:8];
+                            [encoder setBytes:&p0   length:sizeof( int32_t) atIndex:9];
+                            [encoder setBytes:&p1   length:sizeof( int32_t) atIndex:10];
+                            [encoder setBytes:&d0   length:sizeof( int32_t) atIndex:11];
+                            [encoder setBytes:&d1   length:sizeof( int32_t) atIndex:12];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
+                        } break;
                    case GGML_OP_DUP:
                    case GGML_OP_CPY:
                    case GGML_OP_CONT:
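For reference, a hedged C sketch of how the host code above reads the YaRN RoPE parameters out of dst->op_params. The index layout is taken directly from the memcpy calls in the hunk; the struct and helper names below are illustrative and not part of the ggml API:

#include <stdint.h>
#include <string.h>

typedef struct {
    int   n_past, n_dims, mode, n_orig_ctx;
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
} rope_params;

static rope_params unpack_rope_params(const int32_t op_params[16]) {
    rope_params p;
    p.n_past     = op_params[0];
    p.n_dims     = op_params[1];
    p.mode       = op_params[2];
    p.n_orig_ctx = op_params[3];
    // the float parameters are stored bit-for-bit in the int32 slots 5..10
    memcpy(&p.freq_base,   op_params +  5, sizeof(float));
    memcpy(&p.freq_scale,  op_params +  6, sizeof(float));
    memcpy(&p.ext_factor,  op_params +  7, sizeof(float));
    memcpy(&p.attn_factor, op_params +  8, sizeof(float));
    memcpy(&p.beta_fast,   op_params +  9, sizeof(float));
    memcpy(&p.beta_slow,   op_params + 10, sizeof(float));
    return p;
}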
300 ggml-metal.metal
@@ -184,36 +184,73 @@ kernel void kernel_soft_max(
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        threadgroup float  * buf [[threadgroup(0)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = (tgpig) / (ne02*ne01);
+    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
+    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
    device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
    device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
 
    // parallel max
-    float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
-    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
+    float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
+
+    for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
        lmax = MAX(lmax, psrc0[i00]);
    }
-    const float max = simd_max(lmax);
+
+    float max = simd_max(lmax);
+    if (tiisg == 0) {
+        buf[sgitg] = max;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // broadcast, simd group number is ntg / 32
+    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    max = buf[0];
 
    // parallel sum
    float lsum = 0.0f;
-    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        const float exp_psrc0 = exp(psrc0[i00] - max);
        lsum += exp_psrc0;
        // Remember the result of exp here. exp is expensive, so we really do not
-        // whish to compute it twice.
+        // wish to compute it twice.
        pdst[i00] = exp_psrc0;
    }
 
-    const float sum = simd_sum(lsum);
+    float sum = simd_sum(lsum);
+    if (tiisg == 0) {
+        buf[sgitg] = sum;
+    }
 
-    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // broadcast, simd group number is ntg / 32
+    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            buf[tpitg] += buf[tpitg + i];
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sum = buf[0];
+
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        pdst[i00] /= sum;
    }
 }
@@ -224,37 +261,73 @@ kernel void kernel_soft_max_4(
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        threadgroup float  * buf [[threadgroup(0)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = (tgpig) / (ne02*ne01);
+    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
+    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
    device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
    device       float4 * pdst4 = (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
 
    // parallel max
-    float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
-    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
+
+    for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
        lmax4 = fmax(lmax4, psrc4[i00]);
    }
-    float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
 
-    const float max = simd_max(lmax);
+    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
+    float max = simd_max(lmax);
+    if (tiisg == 0) {
+        buf[sgitg] = max;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // broadcast, simd group number is ntg / 32
+    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    max = buf[0];
 
    // parallel sum
    float4 lsum4 = 0.0f;
-    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
        const float4 exp_psrc4 = exp(psrc4[i00] - max);
        lsum4 += exp_psrc4;
        pdst4[i00] = exp_psrc4;
    }
-    float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
 
-    const float sum = simd_sum(lsum);
+    const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
+    float sum = simd_sum(lsum);
+    if (tiisg == 0) {
+        buf[sgitg] = sum;
+    }
 
-    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // broadcast, simd group number is ntg / 32
+    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            buf[tpitg] += buf[tpitg + i];
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sum = buf[0];
+
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
        pdst4[i00] /= sum;
    }
 }
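The rewritten soft_max kernels still compute the usual numerically stable softmax; only the reduction strategy changed (a per-simdgroup simd_max/simd_sum followed by a tree reduction through the new threadgroup buffer). A small single-threaded C reference of the math itself, assuming nothing beyond the standard library:

#include <math.h>
#include <stdio.h>

static void soft_max_row(const float * src, float * dst, int n) {
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = src[i] > max ? src[i] : max;   // "parallel max" in the kernel
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        dst[i] = expf(src[i] - max);         // exp computed once and cached
        sum += dst[i];                       // "parallel sum" in the kernel
    }
    for (int i = 0; i < n; ++i) {
        dst[i] /= sum;
    }
}

int main(void) {
    const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4];
    soft_max_row(x, y, 4);
    printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}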
@@ -771,6 +844,79 @@ kernel void kernel_mul_mv_f32_f32(
    }
 }
 
+#define N_F16_F16 4
+
+kernel void kernel_mul_mv_f16_f16(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F16;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F16; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (half) x[i] * (half) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F16; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const half  * y  = (device const half  *) (src1 + r1*nb11 + im*nb12);
+            device const half4 * y4 = (device const half4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
 kernel void kernel_mul_mv_f16_f32_1row(
        device const  char * src0,
        device const  char * src1,
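A hedged CPU sketch of the accumulation pattern in the new kernel_mul_mv_f16_f16: each lane walks the row with a stride, four elements at a time when the row length allows it, and the leftover tail is folded in at the end. Plain float stands in for half here, and the function name is illustrative only:

static float dot_blocked(const float * x, const float * y, int n) {
    float sum = 0.0f;
    int i = 0;
    for (; i + 4 <= n; i += 4) {      // vectorized body (half4 in the kernel)
        for (int k = 0; k < 4; ++k) {
            sum += x[i + k] * y[i + k];
        }
    }
    for (; i < n; ++i) {              // scalar tail, handled by lane 0 in the kernel
        sum += x[i] * y[i];
    }
    return sum;
}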
@@ -988,6 +1134,45 @@ kernel void kernel_alibi_f32(
    }
 }
 
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
+    thread float * cos_theta, thread float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+    }
+    *cos_theta = cos(theta) * mscale;
+    *sin_theta = sin(theta) * mscale;
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
+    return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
+}
+
+static void rope_yarn_corr_dims(
+    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    dims[0] = max(0.0f,          floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
+    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
+}
+
 typedef void (rope_t)(
        device const    void * src0,
        device const int32_t * src1,
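The rope_yarn_corr_* helpers above implement the closed form quoted in their comment, corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base)). A C restatement under that same formula, with pi spelled out so it stays portable; this mirrors the Metal helpers rather than defining any new behavior:

#include <math.h>

static float yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
    const float pi = 3.14159265358979323846f;
    return n_dims * logf(n_orig_ctx / (n_rot * 2.0f * pi)) / (2.0f * logf(base));
}

static void yarn_corr_dims(int n_dims, int n_orig_ctx, float freq_base,
                           float beta_fast, float beta_slow, float dims[2]) {
    // start and end correction dims, clamped to the valid rotation index range
    dims[0] = fmaxf(0.0f,          floorf(yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
    dims[1] = fminf(n_dims - 1.0f, ceilf (yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
}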
@@ -1011,8 +1196,13 @@ typedef void (rope_t)(
        constant         int & n_past,
        constant         int & n_dims,
        constant         int & mode,
+        constant         int & n_orig_ctx,
        constant       float & freq_base,
        constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
        uint  tiitg[[thread_index_in_threadgroup]],
        uint3 tptg[[threads_per_threadgroup]],
        uint3 tgpig[[threadgroup_position_in_grid]]);
@@ -1041,8 +1231,13 @@ kernel void kernel_rope(
        constant         int & n_past,
        constant         int & n_dims,
        constant         int & mode,
+        constant         int & n_orig_ctx,
        constant       float & freq_base,
        constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
        uint  tiitg[[thread_index_in_threadgroup]],
        uint3 tptg[[threads_per_threadgroup]],
        uint3 tgpig[[threadgroup_position_in_grid]]) {
@@ -1052,19 +1247,22 @@ kernel void kernel_rope(
 
    const bool is_neox = mode & 2;
 
+    float corr_dims[2];
+    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+
    device const int32_t * pos = src1;
 
    const int64_t p = pos[i2];
 
-    const float theta_0 = freq_scale * (float)p;
+    const float theta_0 = (float)p;
    const float inv_ndims = -1.f/n_dims;
 
    if (!is_neox) {
        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
 
            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
-            const float cos_theta = cos(theta);
-            const float sin_theta = sin(theta);
+            float cos_theta, sin_theta;
+            rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -1079,9 +1277,12 @@ kernel void kernel_rope(
        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
 
-                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
-                const float cos_theta = cos(theta);
-                const float sin_theta = sin(theta);
+                // simplified from `(ib * n_dims + ic) * inv_ndims`
+                const float cur_rot = inv_ndims*ic - ib;
+
+                const float theta = theta_0 * pow(freq_base, cur_rot);
+                float cos_theta, sin_theta;
+                rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
                const int64_t i0 = ib*n_dims + ic/2;
 
@@ -1101,6 +1302,39 @@ kernel void kernel_rope(
 template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
 template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
 
+kernel void kernel_im2col_f16(
+        device const float * x,
+        device        half * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0;
+    const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1;
+
+    const int32_t offset_dst =
+        (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
+        (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = 0.0f;
+    } else {
+        const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
+        dst[offset_dst] = x[offset_src + iih * IW + iiw];
+    }
+}
+
 kernel void kernel_cpy_f16_f16(
        device const half * src0,
        device       half * dst,
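A hedged, single-channel CPU sketch of the im2col mapping that kernel_im2col_f16 performs on the GPU: every (output position, kernel tap) pair reads one input pixel, or zero when the tap falls into the padding. Dimension names follow the kernel arguments; the flat output layout chosen here is only one plausible convention and is not taken from the patch:

static void im2col_2d(const float * x, float * dst,
                      int IW, int IH, int KW, int KH, int OW, int OH,
                      int s0, int s1, int p0, int p1, int d0, int d1) {
    for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
            for (int kh = 0; kh < KH; ++kh) {
                for (int kw = 0; kw < KW; ++kw) {
                    const int iw = ow*s0 + kw*d0 - p0;   // same index math as iiw in the kernel
                    const int ih = oh*s1 + kh*d1 - p1;   // same index math as iih in the kernel
                    float v = 0.0f;
                    if (iw >= 0 && iw < IW && ih >= 0 && ih < IH) {
                        v = x[ih*IW + iw];
                    }
                    dst[((oh*OW + ow)*KH + kh)*KW + kw] = v;
                }
            }
        }
    }
}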
248 ggml-quants.c
@@ -14,32 +14,12 @@
 //
 #include <arm_neon.h>
 
-#if !defined(__aarch64__)
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-#endif
-
 #else
 
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
-#ifdef __POWER9_VECTOR__
+#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
 #include <altivec.h>
 #undef bool
 #define bool _Bool
@@ -47,13 +27,15 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv) && !defined(__s390__)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __riscv_v_intrinsic
 #include <riscv_vector.h>
@@ -61,6 +43,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 
 #undef MIN
 #undef MAX
+
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
@@ -283,9 +266,31 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
 #if defined(__ARM_NEON)
+
 #if !defined(__aarch64__)
 
+// 64-bit compatibility
+
+// vaddvq_s16
+// vpaddq_s16
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
 inline static int32_t vaddvq_s32(int32x4_t v) {
    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -311,6 +316,96 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
    return res;
 }
 
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t  int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t  int8x16x2_t
+#define ggml_int8x16x4_t  int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2  vld1q_u8_x2
+#define ggml_vld1q_u8_x4  vld1q_u8_x4
+#define ggml_vld1q_s8_x2  vld1q_s8_x2
+#define ggml_vld1q_s8_x4  vld1q_s8_x4
+
 #endif
 #endif
 
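The ggml_vld1q_* wrappers above follow a common shim pattern: emulate a missing combined load intrinsic with a small struct plus an inline loader, and alias straight to the native type and intrinsic when the toolchain provides them. A generic, self-contained C illustration of the same pattern; the names and the feature-test macro here are hypothetical and not part of ggml:

#include <stdint.h>

#if defined(HAVE_NATIVE_LOAD_X2)            // hypothetical feature test
#define my_i32x2_t   native_i32x2_t
#define my_load_x2   native_load_x2
#else
typedef struct my_i32x2_t { int32_t val[2]; } my_i32x2_t;

static inline my_i32x2_t my_load_x2(const int32_t * ptr) {
    my_i32x2_t res;
    res.val[0] = ptr[0];
    res.val[1] = ptr[1];
    return res;
}
#endif

int32_t sum_pair(const int32_t * ptr) {
    const my_i32x2_t v = my_load_x2(ptr);   // call sites stay identical either way
    return v.val[0] + v.val[1];
}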
@@ -716,6 +811,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
    }
 #else
+    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_0_reference(x, y, k);
 #endif
@@ -969,6 +1065,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
        y[i].s = sum*d;
    }
 #else
+    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_1_reference(x, y, k);
 #endif
@@ -1271,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
    float max = x[0];
    float sum_w = weights[0];
    float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
    for (int i = 1; i < n; ++i) {
+#endif
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
        float w = weights[i];
@@ -3555,7 +3657,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
    const int32x4_t vzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x2_t q2bytes;
+    ggml_int8x16x2_t q2bytes;
    uint8_t aux[16];
 
    float sum = 0;
@@ -3574,8 +3676,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        vst1q_u8(aux, scales);
 
        const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
-        const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
-        const int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+        const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
        const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
                                       vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
        const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -3603,7 +3705,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
-        q8bytes = vld1q_s8_x2(q8); q8 += 32;\
+        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
        MULTIPLY_ACCUM_WITH_SCALE((index));
@@ -3611,9 +3713,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
        for (int j = 0; j < QK_K/128; ++j) {
 
-            const uint8x16x2_t q2bits = vld1q_u8_x2(q2); q2 += 32;
+            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
 
-            int8x16x2_t q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
            q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
            MULTIPLY_ACCUM_WITH_SCALE(0);
@@ -3947,7 +4049,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
    const int32x4_t vzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q2bytes;
+    ggml_int8x16x4_t q2bytes;
 
    uint32_t aux32[2];
    const uint8_t * scales = (const uint8_t *)aux32;
@@ -3972,7 +4074,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
        const uint8x16_t q2bits = vld1q_u8(q2);
 
-        const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
@@ -4236,7 +4338,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
    const uint8x16_t m3 = vshlq_n_u8(m0, 3);
    const int8_t m32 = 32;
 
-    int8x16x4_t q3bytes;
+    ggml_int8x16x4_t q3bytes;
 
    float sum = 0;
 
@@ -4248,9 +4350,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        const uint8_t * restrict qh = x[i].hmask;
        const int8_t  * restrict q8 = y[i].qs;
 
-        uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
-        uint8x16x4_t q3h;
+        ggml_uint8x16x4_t q3h;
 
        int32_t isum = 0;
 
@@ -4266,9 +4368,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
        for (int j = 0; j < QK_K/128; ++j) {
 
-            const uint8x16x2_t q3bits = vld1q_u8_x2(q3); q3 += 32;
-            const int8x16x4_t q8bytes_1 = vld1q_s8_x4(q8); q8 += 64;
-            const int8x16x4_t q8bytes_2 = vld1q_s8_x4(q8); q8 += 64;
+            const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
+            const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
+            const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
 
            q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
            q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
@@ -4770,7 +4872,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
    const uint8x16_t m3b = vdupq_n_u8(0x3);
    const uint8x16_t mh  = vdupq_n_u8(4);
 
-    int8x16x4_t q3bytes;
+    ggml_int8x16x4_t q3bytes;
 
    uint16_t aux16[2];
    int8_t * scales = (int8_t *)aux16;
@@ -4779,11 +4881,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
    for (int i = 0; i < nb; ++i) {
 
-        uint8x16x4_t q3h;
+        ggml_uint8x16x4_t q3h;
 
        const uint8x8_t  hbits    = vld1_u8(x[i].hmask);
        const uint8x16_t q3bits   = vld1q_u8(x[i].qs);
-        const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs);
 
        const uint16_t a = *(const uint16_t *)x[i].scales;
        aux16[0] = a & 0x0f0f;
@@ -5132,8 +5234,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
    const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x2_t q4bytes;
-    int8x16x2_t q8bytes;
+    ggml_int8x16x2_t q4bytes;
+    ggml_int8x16x2_t q8bytes;
 
    float sumf = 0;
 
@@ -5168,17 +5270,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
        for (int j = 0; j < QK_K/64; ++j) {
 
-            const uint8x16x2_t q4bits = vld1q_u8_x2(q4); q4 += 32;
+            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
 
 #ifdef __ARM_FEATURE_DOTPROD
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
 
            const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
            sumi1 += vaddvq_s32(p1) * scales[2*j+0];
 
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
 
@@ -5186,7 +5288,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
 #else
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
            const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5195,7 +5297,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
                                           vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
            sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
 
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
            const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5510,8 +5612,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
    float sumf = 0;
 
-    int8x16x2_t q4bytes;
-    int8x16x4_t q8bytes;
+    ggml_int8x16x2_t q4bytes;
+    ggml_int8x16x4_t q8bytes;
 
    float sum_mins = 0.f;
 
@@ -5532,10 +5634,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
        const float d = y[i].d * (float)x[i].d[0];
 
-        const uint8x16x2_t q4bits = vld1q_u8_x2(q4);
+        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
 
 #ifdef __ARM_FEATURE_DOTPROD
-        q8bytes = vld1q_s8_x4(q8);
+        q8bytes = ggml_vld1q_s8_x4(q8);
        q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
        q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
 
@@ -5549,7 +5651,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
        const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
 
 #else
-        q8bytes = vld1q_s8_x4(q8);
+        q8bytes = ggml_vld1q_s8_x4(q8);
        q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
        q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
        const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5783,7 +5885,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
    const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q5bytes;
+    ggml_int8x16x4_t q5bytes;
 
    float sumf = 0;
 
@@ -5813,16 +5915,16 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        const uint8_t * restrict qh = x[i].qh;
        const int8_t  * restrict q8 = y[i].qs;
 
-        uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
-        uint8x16x4_t q5h;
+        ggml_uint8x16x4_t q5h;
 
        int32_t sumi = 0;
 
        for (int j = 0; j < QK_K/64; ++j) {
 
-            const uint8x16x2_t q5bits = vld1q_u8_x2(q5); q5 += 32;
-            const int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
+            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
            q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
            q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6216,8 +6318,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
    const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q5bytes;
-    uint8x16x4_t q5h;
+    ggml_int8x16x4_t q5bytes;
+    ggml_uint8x16x4_t q5h;
 
    float sumf = 0;
 
@@ -6232,8 +6334,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
        const uint8x8_t qhbits = vld1_u8(qh);
 
-        const uint8x16x2_t q5bits = vld1q_u8_x2(q5);
-        const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+        const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
        const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
        q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
@@ -6509,8 +6611,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
    const uint8x16_t mone = vdupq_n_u8(3);
 
-    int8x16x4_t q6bytes;
-    uint8x16x4_t q6h;
+    ggml_int8x16x4_t q6bytes;
+    ggml_uint8x16x4_t q6h;
 
    for (int i = 0; i < nb; ++i) {
 
@@ -6522,9 +6624,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
        const int8_t * restrict scale = x[i].scales;
 
-        const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
        const int8x16_t scales = vld1q_s8(scale);
-        const int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+        const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
 
        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
@@ -6536,9 +6638,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
        for (int j = 0; j < QK_K/128; ++j) {
 
-            uint8x16x2_t qhbits = vld1q_u8_x2(qh); qh += 32;
-            uint8x16x4_t q6bits = vld1q_u8_x4(q6); q6 += 64;
-            int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
+            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
+            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6581,7 +6683,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
            scale += 2;
 #endif
 
-            q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
            shifted = vshrq_n_u8(qhbits.val[0], 4);
            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
@@ -6985,8 +7087,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
    const uint8x16_t mone = vdupq_n_u8(3);
 
-    int8x16x4_t q6bytes;
-    uint8x16x4_t q6h;
+    ggml_int8x16x4_t q6bytes;
+    ggml_uint8x16x4_t q6h;
 
    for (int i = 0; i < nb; ++i) {
 
@@ -7001,8 +7103,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||||||
int32_t isum = 0;
|
int32_t isum = 0;
|
||||||
|
|
||||||
uint8x16_t qhbits = vld1q_u8(qh);
|
uint8x16_t qhbits = vld1q_u8(qh);
|
||||||
uint8x16x2_t q6bits = vld1q_u8_x2(q6);
|
ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6);
|
||||||
int8x16x4_t q8bytes = vld1q_s8_x4(q8);
|
ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
|
||||||
|
|
||||||
q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
|
q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
|
||||||
uint8x16_t shifted = vshrq_n_u8(qhbits, 2);
|
uint8x16_t shifted = vshrq_n_u8(qhbits, 2);
|
||||||
|
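The renames above route the ARM NEON multi-register loads through `ggml_vld1q_*` wrappers and `ggml_int8x16x4_t`-style typedefs instead of using the `vld1q_*_x2/_x4` intrinsics directly, which some toolchains do not provide. A minimal sketch of what such a compatibility layer typically looks like; the exact definitions live in ggml's implementation headers and may differ, so treat the names and layout here as an assumption:

```
#include <arm_neon.h>

// Hypothetical fallback typedef + loader mirroring the ggml_* wrappers used above.
// On toolchains that provide vld1q_s8_x4 natively, the wrapper can simply alias it.
typedef struct ggml_int8x16x4_t {
    int8x16_t val[4];
} ggml_int8x16x4_t;

static inline ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
    ggml_int8x16x4_t res;
    res.val[0] = vld1q_s8(ptr +  0);   // four consecutive 16-byte loads
    res.val[1] = vld1q_s8(ptr + 16);
    res.val[2] = vld1q_s8(ptr + 32);
    res.val[3] = vld1q_s8(ptr + 48);
    return res;
}
```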
114 ggml.h
@@ -58,7 +58,8 @@
 // {
 // ...
 //
-// struct ggml_cgraph gf = ggml_build_forward(f);
+// struct ggml_cgraph * gf = ggml_new_graph(ctx);
+// ggml_build_forward_expand(gf, f);
 //
 // // set the input variable and parameter values
 // ggml_set_f32(x, 2.0f);
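The documentation comment now reflects the new graph-building flow: graphs are allocated from a context with `ggml_new_graph()` and populated with `ggml_build_forward_expand()` instead of being returned by value from `ggml_build_forward()`. A minimal sketch of the updated flow, assuming illustrative sizes and tensor names that are not taken from this diff:

```
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // Enough memory for the tensors, the graph metadata and the compute work buffer.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * f = ggml_sqr(ctx, x);          // f(x) = x^2

    struct ggml_cgraph * gf = ggml_new_graph(ctx);      // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
    ggml_build_forward_expand(gf, f);

    ggml_set_f32(x, 2.0f);
    ggml_graph_compute_with_ctx(ctx, gf, 1);            // n_threads = 1
    printf("f(2) = %f\n", ggml_get_f32_1d(f, 0));       // expected: 4.0

    ggml_free(ctx);
    return 0;
}
```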
@@ -214,14 +215,13 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

 #define GGML_MAX_DIMS 4
-#define GGML_MAX_NODES 16384
 #define GGML_MAX_PARAMS 1024
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 64
-#define GGML_MAX_OP_PARAMS 32
+#define GGML_MAX_OP_PARAMS 64
 #define GGML_DEFAULT_N_THREADS 4
+#define GGML_DEFAULT_GRAPH_SIZE 2048

 #if UINTPTR_MAX == 0xFFFFFFFF
     #define GGML_MEM_ALIGN 4
 #else
@@ -245,7 +245,10 @@
     do { \
         if (!(x)) { \
             fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
+            fflush(stderr); \
+            fflush(stdout); \
+            ggml_print_backtrace(); \
+            exit(1); \
         } \
     } while (0)
@@ -400,13 +403,8 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D,
-        GGML_OP_CONV_1D_STAGE_0, // internal
-        GGML_OP_CONV_1D_STAGE_1, // internal
         GGML_OP_CONV_TRANSPOSE_1D,
-        GGML_OP_CONV_2D,
-        GGML_OP_CONV_2D_STAGE_0, // internal
-        GGML_OP_CONV_2D_STAGE_1, // internal
+        GGML_OP_IM2COL,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
@@ -451,6 +449,7 @@ extern "C" {
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
+        GGML_UNARY_OP_LEAKY
     };

     enum ggml_object_type {
@@ -531,37 +530,33 @@ extern "C" {

         int n_threads;

-        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
-        int n_tasks[GGML_MAX_NODES];

         // abort ggml_graph_compute when true
         bool (*abort_callback)(void * data);
         void * abort_callback_data;
     };

-    // next prime after GGML_MAX_NODES
-    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
-    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
-    // #define GGML_GRAPH_HASHTABLE_SIZE 8273
-    // #define GGML_GRAPH_HASHTABLE_SIZE 16411
-    #define GGML_GRAPH_HASHTABLE_SIZE 32771

     enum ggml_cgraph_eval_order {
         GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
         GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
         GGML_CGRAPH_EVAL_ORDER_COUNT
     };

+    struct ggml_hash_set {
+        size_t size;
+        struct ggml_tensor ** keys;
+    };

     // computation graph
     struct ggml_cgraph {
+        int size;
         int n_nodes;
         int n_leafs;

-        struct ggml_tensor * nodes[GGML_MAX_NODES];
-        struct ggml_tensor * grads[GGML_MAX_NODES];
-        struct ggml_tensor * leafs[GGML_MAX_NODES];
+        struct ggml_tensor ** nodes;
+        struct ggml_tensor ** grads;
+        struct ggml_tensor ** leafs;

-        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+        struct ggml_hash_set visited_hash_table;

         enum ggml_cgraph_eval_order order;

@@ -571,8 +566,6 @@ extern "C" {
         int64_t perf_time_us;
     };

-    static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);

     // scratch buffer
     struct ggml_scratch {
         size_t offs;
@@ -617,6 +610,8 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);

+    GGML_API void ggml_print_backtrace(void);

     GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
     GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

@@ -943,6 +938,10 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_leaky(
+           struct ggml_context * ctx,
+           struct ggml_tensor * a);

     GGML_API struct ggml_tensor * ggml_relu_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor * a);
@@ -1326,8 +1325,13 @@ extern "C" {
            int n_dims,
            int mode,
            int n_ctx,
+           int n_orig_ctx,
            float freq_base,
-           float freq_scale);
+           float freq_scale,
+           float ext_factor,
+           float attn_factor,
+           float beta_fast,
+           float beta_slow);

     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
@@ -1337,8 +1341,17 @@ extern "C" {
            int n_dims,
            int mode,
            int n_ctx,
+           int n_orig_ctx,
            float freq_base,
-           float freq_scale);
+           float freq_scale,
+           float ext_factor,
+           float attn_factor,
+           float beta_fast,
+           float beta_slow);

+    // compute correction dims for YaRN RoPE scaling
+    void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

     // xPos RoPE, in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
@@ -1358,8 +1371,13 @@ extern "C" {
            int n_dims,
            int mode,
            int n_ctx,
+           int n_orig_ctx,
            float freq_base,
            float freq_scale,
+           float ext_factor,
+           float attn_factor,
+           float beta_fast,
+           float beta_slow,
            float xpos_base,
            bool xpos_down);

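The RoPE entry points above gain the YaRN scaling parameters (`n_orig_ctx`, `ext_factor`, `attn_factor`, `beta_fast`, `beta_slow`). A hedged sketch of a call through the extended signature; the leading `ctx`/activation/position arguments are assumed unchanged from the existing declaration, and the neutral-looking values shown are only illustrative, not the defaults a given model requires:

```
// Sketch only: `ctx`, `cur` (activations) and `pos` (an I32 tensor of token positions)
// are assumed to already exist in the graph.
struct ggml_tensor * rotated = ggml_rope_custom(
        ctx, cur, pos,
        /*n_dims      =*/ 128,       // rotary dimensions (model dependent)
        /*mode        =*/ 0,
        /*n_ctx       =*/ 4096,      // current context size
        /*n_orig_ctx  =*/ 4096,      // context size the model was trained with
        /*freq_base   =*/ 10000.0f,
        /*freq_scale  =*/ 1.0f,
        /*ext_factor  =*/ 0.0f,      // 0 disables the YaRN interpolation path
        /*attn_factor =*/ 1.0f,
        /*beta_fast   =*/ 32.0f,
        /*beta_slow   =*/ 1.0f);
```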
@@ -1380,6 +1398,18 @@ extern "C" {
            float min,
            float max);

+    GGML_API struct ggml_tensor * ggml_im2col(
+           struct ggml_context * ctx,
+           struct ggml_tensor * a,
+           struct ggml_tensor * b,
+           int s0,
+           int s1,
+           int p0,
+           int p1,
+           int d0,
+           int d1,
+           bool is_2D);

     GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
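`GGML_OP_IM2COL` replaces the old internal CONV_1D/CONV_2D stage ops, and `ggml_im2col()` is the building block the convolution helpers can lower to: it unrolls patches of the input into columns so that the convolution itself becomes a matrix multiplication. A rough call sketch, assuming (as in the existing `ggml_conv_*` helpers) that `a` is the kernel and `b` the input; the parameter meanings follow the stride/padding/dilation convention already documented for the 1-D helpers:

```
// Sketch: unroll patches of a 2-D input into columns; the convolution result can
// then be obtained with a ggml_mul_mat against the flattened kernel.
struct ggml_tensor * cols = ggml_im2col(ctx, a, b,
        /*s0 =*/ 1, /*s1 =*/ 1,      // stride in the two spatial dimensions
        /*p0 =*/ 1, /*p1 =*/ 1,      // padding
        /*d0 =*/ 1, /*d1 =*/ 1,      // dilation
        /*is_2D =*/ true);
```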
@@ -1463,6 +1493,8 @@ extern "C" {
            int s0, // stride
            int p0); // padding

+    // the result will have 2*p0 padding for the first dimension
+    // and 2*p1 padding for the second dimension
     GGML_API struct ggml_tensor * ggml_pool_2d(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
@@ -1471,8 +1503,8 @@ extern "C" {
            int k1,
            int s0,
            int s1,
-           int p0,
-           int p1);
+           float p0,
+           float p1);

     // nearest interpolate
     // used in stable-diffusion
@@ -1713,19 +1745,22 @@ extern "C" {
     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

-    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
-    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

     // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
+    GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
+    GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);

     GGML_API size_t ggml_graph_overhead(void);
+    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
     GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);

     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
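With graphs now allocated from a context rather than held in fixed-size structs, the context has to reserve room for the graph metadata itself. A minimal sketch of sizing a dedicated context for a custom-sized graph using the overhead helpers declared above; the node budget and slack are illustrative, and the work buffer needed for computation is not included here:

```
// Sketch: allocate a metadata-only context, then build a custom graph with gradients.
const size_t graph_size = 4096;                              // hypothetical node budget
const size_t mem_size   = ggml_graph_overhead_custom(graph_size, /*grads =*/ true)
                        + ggml_tensor_overhead()             // a little slack for bookkeeping
                        + 1024;

struct ggml_init_params params = {
    /*.mem_size   =*/ mem_size,
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true,                                  // metadata only, no tensor data
};
struct ggml_context * ctx_graph = ggml_init(params);

struct ggml_cgraph * gb = ggml_new_graph_custom(ctx_graph, graph_size, /*grads =*/ true);
```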
@@ -1734,7 +1769,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

     GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);

     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
@@ -1797,6 +1832,8 @@ extern "C" {
     struct ggml_opt_params {
         enum ggml_opt_type type;

+        size_t graph_size;

         int n_threads;

         // delta-based convergence test
@@ -2008,6 +2045,7 @@ extern "C" {
     GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
     GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
     GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
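The new `gguf_get_val_data()` accessor exposes a raw pointer to a scalar KV value. A hedged sketch of reading a value through it, using the existing `gguf_find_key()`/`gguf_get_kv_type()` accessors; the key name is only an example:

```
// Sketch: read a 32-bit unsigned KV value via the raw-data accessor.
const int key_id = gguf_find_key(ctx, "general.alignment");   // example key
if (key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_UINT32) {
    const uint32_t value = *(const uint32_t *) gguf_get_val_data(ctx, key_id);
    printf("alignment: %u\n", value);
}
```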
gguf-py/README.md
@@ -11,6 +11,16 @@ as an example for its usage.
 pip install gguf
 ```

+## API Examples/Simple Tools
+
+[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
+
+[scripts/gguf-dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-dump.py) — Dumps a GGUF file's metadata to the console.
+
+[scripts/gguf-set-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-set-metadata.py) — Allows changing simple metadata values in a GGUF file by key.
+
+[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files.
+
 ## Development
 Maintainers who participate in development of this package are advised to install it in editable mode:

40 gguf-py/examples/writer.py (new executable file)
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+import sys
+from pathlib import Path
+
+import numpy as np
+
+# Necessary to load the local gguf package
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from gguf import GGUFWriter  # noqa: E402
+
+
+# Example usage:
+def writer_example() -> None:
+    # Example usage with a file
+    gguf_writer = GGUFWriter("example.gguf", "llama")
+
+    gguf_writer.add_architecture()
+    gguf_writer.add_block_count(12)
+    gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
+    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
+    gguf_writer.add_custom_alignment(64)
+
+    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
+    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
+    tensor3 = np.ones((96,), dtype=np.float32) * 102.0
+
+    gguf_writer.add_tensor("tensor1", tensor1)
+    gguf_writer.add_tensor("tensor2", tensor2)
+    gguf_writer.add_tensor("tensor3", tensor3)
+
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+
+    gguf_writer.close()
+
+
+if __name__ == '__main__':
+    writer_example()
Some files were not shown because too many files have changed in this diff.