Repository: https://github.com/ggerganov/llama.cpp.git

commit e75889a9b8: Merge branch 'master' into cuda-cublas-opts

ggml-ci
@@ -15,6 +15,9 @@ indent_size = 4
 [Makefile]
 indent_style = tab

+[scripts/*.mk]
+indent_style = tab
+
 [prompts/*.txt]
 insert_final_newline = unset

.github/workflows/build.yml (vendored, 15 changes)

@@ -143,6 +143,9 @@ jobs:
 cd build
 ctest --verbose

+# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+# how to debug it.
+# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
 macOS-latest-make:
 runs-on: macos-latest

@@ -160,14 +163,18 @@ jobs:
 - name: Build
 id: make_build
 run: |
-make -j $(sysctl -n hw.logicalcpu)
+LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

 - name: Test
 id: make_test
 run: |
-make tests -j $(sysctl -n hw.logicalcpu)
-make test -j $(sysctl -n hw.logicalcpu)
+LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
+LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)

+# TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+# how to debug it.
+# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
+# would be great if we fix these
 macOS-latest-cmake:
 runs-on: macos-latest

@@ -188,7 +195,7 @@ jobs:
 sysctl -a
 mkdir build
 cd build
-cmake ..
+cmake -DLLAMA_METAL=OFF ..
 cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test
.gitignore (vendored, 1 change)

@@ -101,3 +101,4 @@ poetry.toml
 /tests/test-tokenizer-1-llama
 /tests/test-tokenizer-1-bpe
 /tests/test-rope
+/tests/test-backend-ops
CMakeLists.txt (147 changes)

@@ -97,9 +97,9 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)

 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

@@ -397,57 +397,102 @@ if (LLAMA_HIPBLAS)
 endif()
 endif()

-if (LLAMA_ALL_WARNINGS)
-if (NOT MSVC)
-set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
-set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
-set(host_cxx_flags "")
+function(get_flags CCID CCVER)
+set(C_FLAGS "")
+set(CXX_FLAGS "")

-if (CMAKE_C_COMPILER_ID MATCHES "Clang")
-set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
+if (CCID MATCHES "Clang")
+set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
+set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)

 if (
-(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
-(CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
+(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
 )
-set(c_flags ${c_flags} -Wdouble-promotion)
+set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
 endif()
-elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
-set(c_flags ${c_flags} -Wdouble-promotion)
-set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
+elseif (CCID STREQUAL "GNU")
+set(C_FLAGS -Wdouble-promotion)
+set(CXX_FLAGS -Wno-array-bounds)

-if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
+if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
 endif()
-if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
-endif()
+if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
 endif()
-else()
-# todo : msvc
 endif()

-set(c_flags ${c_flags} ${warning_flags})
-set(cxx_flags ${cxx_flags} ${warning_flags})
-add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-"$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
+set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
+set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()

+if (LLAMA_ALL_WARNINGS)
+if (NOT MSVC)
+set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+-Werror=implicit-int -Werror=implicit-function-declaration)
+set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS})
+set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
+
+get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+else()
+# todo : msvc
+set(C_FLAGS "")
+set(CXX_FLAGS "")
+endif()
 endif()

-if (NOT MSVC)
-set(cuda_flags -Wno-pedantic)
-endif()
-set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+if (LLAMA_CUBLAS)
+set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
+if (NOT MSVC)
+set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
+endif()

-list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
-if (NOT cuda_host_flags STREQUAL "")
-set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
-endif()
+if (LLAMA_ALL_WARNINGS AND NOT MSVC)
+set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
+if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
+set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+endif()

-add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
+execute_process(
+COMMAND ${NVCC_CMD} -Xcompiler --version
+OUTPUT_VARIABLE CUDA_CCFULLVER
+ERROR_QUIET
+)
+
+if (NOT CUDA_CCFULLVER MATCHES clang)
+set(CUDA_CCID "GNU")
+execute_process(
+COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
+OUTPUT_VARIABLE CUDA_CCVER
+ERROR_QUIET
+)
+else()
+if (CUDA_CCFULLVER MATCHES Apple)
+set(CUDA_CCID "AppleClang")
+else()
+set(CUDA_CCID "Clang")
+endif()
+string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
+endif()
+
+message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
+
+get_flags(${CUDA_CCID} ${CUDA_CCVER})
+list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument
+if (NOT CUDA_CXX_FLAGS STREQUAL "")
+set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
+endif()
+endif()
+
+add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+endif()

 if (WIN32)
 add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

@@ -471,6 +516,7 @@ endif()
 execute_process(
 COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
 ERROR_VARIABLE output
+OUTPUT_QUIET
 )
 if (output MATCHES "dyld-1015\.7")
 add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)

@@ -593,6 +639,11 @@ else()
 message(STATUS "Unknown architecture")
 endif()

+if (MINGW)
+# Target Windows 8 for PrefetchVirtualMemory
+add_compile_definitions(_WIN32_WINNT=0x602)
+endif()
+
 #
 # POSIX conformance
 #

@@ -662,11 +713,11 @@ add_library(ggml OBJECT
 ggml-backend.h
 ggml-quants.c
 ggml-quants.h
 ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
 ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
 ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
 ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
 ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
 )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
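The new LLAMA_CUBLAS block above has to identify which host compiler nvcc forwards code to before it can pick warning flags for it, so it inspects the compiler's --version banner. The following is a rough standalone sketch of that classification heuristic in Python; it is not part of the repository, the banners are made up, and for GNU compilers the real build scripts query the version with -dumpfullversion/-dumpversion instead of parsing the banner:

import re

def classify_compiler(version_banner: str) -> tuple[str, str]:
    # Mirrors the CMake logic above: no "clang" in the banner -> assume GNU,
    # otherwise AppleClang vs Clang depending on an "Apple" marker.
    if "clang" not in version_banner:
        return "GNU", ""  # the real scripts then ask the compiler itself for its version
    ccid = "AppleClang" if "Apple" in version_banner else "Clang"
    m = re.search(r" version ([0-9.]+)", version_banner)
    return ccid, m.group(1) if m else ""

# Illustrative banners only:
print(classify_compiler("Apple clang version 15.0.0 (clang-1500.1.0.2.5)"))  # ('AppleClang', '15.0.0')
print(classify_compiler("clang version 16.0.6"))                             # ('Clang', '16.0.6')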
Makefile (146 changes)

@@ -8,7 +8,8 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope
+tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
+tests/test-backend-ops

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -25,20 +26,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

-ifeq '' '$(findstring clang,$(shell $(CC) --version))'
-CC_IS_GCC=1
-CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
-else
-CC_IS_CLANG=1
-ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
-CC_IS_LLVM_CLANG=1
-else
-CC_IS_APPLE_CLANG=1
-endif
-CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
-| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
-endif
-
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)

@@ -120,12 +107,12 @@ MK_CXXFLAGS = -std=c++11 -fPIC

 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
 MK_CFLAGS += -Ofast
-MK_HOST_CXXFLAGS += -Ofast
-MK_CUDA_CXXFLAGS += -O3
+HOST_CXXFLAGS += -Ofast
+MK_NVCCFLAGS += -O3
 else
 MK_CFLAGS += -O3
 MK_CXXFLAGS += -O3
 endif

 # clock_gettime came in POSIX.1b (1993)

@@ -219,30 +206,6 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 -Werror=implicit-function-declaration
 MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn

-ifeq ($(CC_IS_CLANG), 1)
-# clang options
-MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return
-MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
-
-ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
-MK_CFLAGS += -Wdouble-promotion
-endif
-ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
-MK_CFLAGS += -Wdouble-promotion
-endif
-else
-# gcc options
-MK_CFLAGS += -Wdouble-promotion
-MK_HOST_CXXFLAGS += -Wno-array-bounds
-
-ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
-MK_HOST_CXXFLAGS += -Wno-format-truncation
-endif
-ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
-MK_HOST_CXXFLAGS += -Wextra-semi
-endif
-endif
-
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER

@@ -293,8 +256,8 @@ ifndef RISCV

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 # Use all CPU extensions that are available:
 MK_CFLAGS += -march=native -mtune=native
-MK_HOST_CXXFLAGS += -march=native -mtune=native
+HOST_CXXFLAGS += -march=native -mtune=native

 # Usage AVX-only
 #MK_CFLAGS += -mfma -mf16c -mavx

@@ -305,12 +268,15 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 #MK_CXXFLAGS += -mssse3
 endif

-# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
-# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-# https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
+# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
+# https://github.com/ggerganov/llama.cpp/issues/2922
 MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
 MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
+
+# Target Windows 8 for PrefetchVirtualMemory
+MK_CPPFLAGS += -D_WIN32_WINNT=0x602
 endif

 ifneq ($(filter aarch64%,$(UNAME_M)),)

@@ -394,61 +360,64 @@ ifdef LLAMA_CUBLAS
 MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 OBJS += ggml-cuda.o
-NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
+MK_NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
+
+ifdef LLAMA_DEBUG
+MK_NVCCFLAGS += -lineinfo
+endif
+
 ifdef LLAMA_CUDA_NVCC
 NVCC = $(LLAMA_CUDA_NVCC)
 else
 NVCC = nvcc
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
-NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-else ifdef CUDA_POWER_ARCH
-NVCCFLAGS +=
-else
-NVCCFLAGS += -arch=native
+MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else ifndef CUDA_POWER_ARCH
+MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
-NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_FORCE_MMQ
-NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
-NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
-NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_MMV_Y
-NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 else ifdef LLAMA_CUDA_DMMV_Y
-NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
+MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
 else
-NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
 ifdef LLAMA_CUDA_F16
-NVCCFLAGS += -DGGML_CUDA_F16
+MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-NVCCFLAGS += -DGGML_CUDA_F16
+MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
-NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
-NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
 else
-NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 #ifdef LLAMA_CUDA_CUBLAS
-# NVCCFLAGS += -DGGML_CUDA_CUBLAS
+# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
-NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-$(NVCC) $(NVCCFLAGS) -c $< -o $@
+$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST

@@ -510,16 +479,22 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-# combine build flags with cmdline overrides
-override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
-override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
-override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
-override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
+GF_CC := $(CC)
+include scripts/get-flags.mk

-# save CXXFLAGS before we add host-only options
-NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
-override CXXFLAGS += $(HOST_CXXFLAGS)
+# combine build flags with cmdline overrides
+override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
+override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
+
+# identify CUDA host compiler
+ifdef LLAMA_CUBLAS
+GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
+include scripts/get-flags.mk
+CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+endif

 #
 # Print build information

@@ -729,16 +704,16 @@ tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
 tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)

@@ -746,3 +721,6 @@ tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)

 tests/test-c.o: tests/test-c.c llama.h
 $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
+
+tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
README.md (13 changes)

@@ -10,6 +10,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

+- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
+- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
 - Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167

@@ -95,7 +97,18 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
+- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
+- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
+- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
+- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
+
+**Multimodal models:**
+
+- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
+- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
+- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
+- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)


 **Bindings:**
@@ -278,8 +278,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.yarn_beta_slow = std::stof(argv[i]);
-} else if (arg == "--memory-f32") {
-params.memory_f16 = false;
 } else if (arg == "--samplers") {
 if (++i >= argc) {
 invalid_param = true;

@@ -510,6 +508,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 params.infill = true;
 } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
 params.dump_kv_cache = true;
+} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+params.no_kv_offload = true;
+} else if (arg == "-ctk" || arg == "--cache-type-k") {
+params.cache_type_k = argv[++i];
+} else if (arg == "-ctv" || arg == "--cache-type-v") {
+params.cache_type_v = argv[++i];
 } else if (arg == "--multiline-input") {
 params.multiline_input = true;
 } else if (arg == "--simple-io") {

@@ -652,6 +656,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 } else if (arg == "-h" || arg == "--help") {
 return false;

+} else if (arg == "--version") {
+fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+exit(0);
 } else if (arg == "--random-prompt") {
 params.random_prompt = true;
 } else if (arg == "--in-prefix-bos") {

@@ -790,6 +798,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf("\n");
 printf("options:\n");
 printf(" -h, --help show this help message and exit\n");
+printf(" --version show version and build info\n");
 printf(" -i, --interactive run in interactive mode\n");
 printf(" --interactive-first run in interactive mode and wait for input right away\n");
 printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");

@@ -858,8 +867,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
 printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
 printf(" --no-penalize-nl do not penalize newline token\n");
-printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
-printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
 printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
 printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
 printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");

@@ -900,6 +907,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --verbose-prompt print prompt before generation\n");
 printf(" -dkvc, --dump-kv-cache\n");
 printf(" verbose print of the KV cache\n");
+printf(" -nkvo, --no-kv-offload\n");
+printf(" disable KV offload\n");
+printf(" -ctk TYPE, --cache-type-k TYPE\n");
+printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
+printf(" -ctv TYPE, --cache-type-v TYPE\n");
+printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
 printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
 printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
 printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");

@@ -1015,6 +1028,29 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
 return mparams;
 }

+static ggml_type kv_cache_type_from_str(const std::string & s) {
+if (s == "f16") {
+return GGML_TYPE_F16;
+}
+if (s == "q8_0") {
+return GGML_TYPE_Q8_0;
+}
+if (s == "q4_0") {
+return GGML_TYPE_Q4_0;
+}
+if (s == "q4_1") {
+return GGML_TYPE_Q4_1;
+}
+if (s == "q5_0") {
+return GGML_TYPE_Q5_0;
+}
+if (s == "q5_1") {
+return GGML_TYPE_Q5_1;
+}
+
+throw std::runtime_error("Invalid cache type: " + s);
+}
+
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
 auto cparams = llama_context_default_params();

@@ -1024,7 +1060,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
 cparams.mul_mat_q = params.mul_mat_q;
 cparams.seed = params.seed;
-cparams.f16_kv = params.memory_f16;
 cparams.logits_all = params.logits_all;
 cparams.embedding = params.embedding;
 cparams.rope_scaling_type = params.rope_scaling_type;

@@ -1035,6 +1070,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 cparams.yarn_beta_fast = params.yarn_beta_fast;
 cparams.yarn_beta_slow = params.yarn_beta_slow;
 cparams.yarn_orig_ctx = params.yarn_orig_ctx;
+cparams.offload_kqv = !params.no_kv_offload;
+
+cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
+cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

 return cparams;
 }

@@ -1447,7 +1486,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 }
 fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
 fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
 fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
 fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
 fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@@ -100,7 +100,6 @@ struct gpt_params {
 size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

 bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
-bool memory_f16 = true; // use f16 instead of f32 for memory kv
 bool random_prompt = false; // do not randomize prompt if none provided
 bool use_color = false; // use color to distinguish generations and inputs
 bool interactive = false; // interactive mode

@@ -125,10 +124,14 @@ struct gpt_params {
 bool verbose_prompt = false; // print prompt tokens before generation
 bool infill = false; // use infill mode
 bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
+bool no_kv_offload = false; // disable KV offloading
+
+std::string cache_type_k = "f16"; // KV cache data type for the K
+std::string cache_type_v = "f16"; // KV cache data type for the V

 // multimodal models (see examples/llava)
 std::string mmproj = ""; // path to multimodal projector
 std::string image = ""; // path to an image file
 };

 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@@ -61,13 +61,13 @@
 // #define LOG_TARGET stderr
 // #include "log.h"
 //
-// The log target can also be redirected to a diffrent function
+// The log target can also be redirected to a different function
 // like so:
 //
-// #define LOG_TARGET log_handler_diffrent()
+// #define LOG_TARGET log_handler_different()
 // #include "log.h"
 //
-// FILE* log_handler_diffrent()
+// FILE* log_handler_different()
 // {
 // return stderr;
 // }

@@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS

 // Disables logs entirely at runtime.
 // Makes LOG() and LOG_TEE() produce no output,
-// untill enabled back.
+// until enabled back.
 #define log_disable() log_disable_impl()

 // INTERNAL, DO NOT USE
@@ -113,13 +113,15 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
 default : break;
 }
 }
-} else result += "-> mirostat ";
+} else {
+result += "-> mirostat ";
+}

 return result;
 }

 // no reasons to expose this function in header
-void sampler_queue(
+static void sampler_queue(
 struct llama_context * ctx_main,
 const llama_sampling_params & params,
 llama_token_data_array & cur_p,
@@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)

 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
 float scale = 1.0f; // xavier
-switch (tensor->n_dims) {
+switch (ggml_n_dims(tensor)) {
 case 1:
 scale /= sqrtf((float) tensor->ne[0]);
 for (int i0 = 0; i0 < tensor->ne[0]; i0++) {

@@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
 }

 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-switch (tensor->n_dims) {
+switch (ggml_n_dims(tensor)) {
 case 1:
 for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
 float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);

@@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
 }

 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-GGML_ASSERT(tensor->n_dims == 1);
 GGML_ASSERT(tensor->ne[0] == ne0);
+GGML_ASSERT(tensor->ne[1] == 1);
+GGML_ASSERT(tensor->ne[2] == 1);
+GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-GGML_ASSERT(tensor->n_dims == 2);
 GGML_ASSERT(tensor->ne[0] == ne0);
 GGML_ASSERT(tensor->ne[1] == ne1);
+GGML_ASSERT(tensor->ne[2] == 1);
+GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-GGML_ASSERT(tensor->n_dims == 3);
 GGML_ASSERT(tensor->ne[0] == ne0);
 GGML_ASSERT(tensor->ne[1] == ne1);
 GGML_ASSERT(tensor->ne[2] == ne2);
+GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-GGML_ASSERT(tensor->n_dims == 4);
 GGML_ASSERT(tensor->ne[0] == ne0);
 GGML_ASSERT(tensor->ne[1] == ne1);
 GGML_ASSERT(tensor->ne[2] == ne2);

@@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
 bool sample_random_offsets
 ) {
 GGML_ASSERT(samples_count > 0);
-GGML_ASSERT(tokens_input->n_dims == 2);
-GGML_ASSERT(target_probs->n_dims == 3);
+GGML_ASSERT(ggml_is_matrix(tokens_input));
+GGML_ASSERT(ggml_is_3d(target_probs));
 int64_t n_vocab = target_probs->ne[0];
 int64_t n_tokens = tokens_input->ne[0];
 int64_t n_batch = tokens_input->ne[1];
@@ -77,8 +77,18 @@ class Model:
         self.gguf_writer.add_embedding_length(n_embd)
         if (n_ff := self.hparams.get("intermediate_size")) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
-        if (n_head := self.hparams.get("num_attention_head")) is not None:
+        if (n_head := self.hparams.get("num_attention_heads")) is not None:
             self.gguf_writer.add_head_count(n_head)
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+            self.gguf_writer.add_head_count_kv(n_head_kv)
+
+        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
+        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
         self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

     def write_tensors(self):

@@ -170,6 +180,8 @@ class Model:
             return StableLMModel
         if model_architecture == "QWenLMHeadModel":
             return QwenModel
+        if model_architecture == "MixtralForCausalLM":
+            return MixtralModel
         return Model

     def _is_model_safetensors(self) -> bool:

@@ -207,6 +219,8 @@ class Model:
             return gguf.MODEL_ARCH.STABLELM
         if arch == "QWenLMHeadModel":
             return gguf.MODEL_ARCH.QWEN
+        if arch == "MixtralForCausalLM":
+            return gguf.MODEL_ARCH.LLAMA

         raise NotImplementedError(f'Architecture "{arch}" not supported!')


@@ -837,6 +851,11 @@ class StableLMModel(Model):
         self.gguf_writer.add_layer_norm_eps(1e-5)


+class MixtralModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+
 class QwenModel(Model):
     @staticmethod
     def token_bytes_to_string(b):
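The extended set_gguf_parameters above only writes fields that are actually present in the model's config, using the same hparams.get pattern for each newly handled key. As a rough standalone illustration (not part of the converter, with made-up values), a Mixtral-style config would now also contribute the KV-head count, the RMS-norm epsilon and the expert counts:

hparams = {
    # Hypothetical config.json excerpt for the sake of the example:
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-05,
    "num_local_experts": 8,
    "num_experts_per_tok": 2,
}

# Each key is optional; missing keys are simply skipped, as in the converter above.
if (n_head_kv := hparams.get("num_key_value_heads")) is not None:
    print("add_head_count_kv      ->", n_head_kv)
if (n_rms_eps := hparams.get("rms_norm_eps")) is not None:
    print("add_layer_norm_rms_eps ->", n_rms_eps)
if (n_experts := hparams.get("num_local_experts")) is not None:
    print("add_expert_count       ->", n_experts)
if (n_experts_used := hparams.get("num_experts_per_tok")) is not None:
    print("add_expert_used_count  ->", n_experts_used)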
@ -3,7 +3,6 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, BinaryIO, Sequence
|
from typing import Any, BinaryIO, Sequence
|
||||||
@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||||
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||||
|
|
||||||
|
|
||||||
HF_SUBLAYER_TO_GGML = {
|
|
||||||
"self_attn.q_proj": "attn_q",
|
|
||||||
"self_attn.k_proj": "attn_k",
|
|
||||||
"self_attn.v_proj": "attn_v",
|
|
||||||
"self_attn.o_proj": "attn_output",
|
|
||||||
"mlp.gate_proj": "ffn_gate",
|
|
||||||
"mlp.down_proj": "ffn_down",
|
|
||||||
"mlp.up_proj": "ffn_up",
|
|
||||||
"input_layernorm": "attn_norm",
|
|
||||||
"post_attention_layernorm": "ffn_norm",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def translate_tensor_name(t: str) -> str:
|
|
||||||
match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
|
|
||||||
if match:
|
|
||||||
nn = match.group(1)
|
|
||||||
sub_layer = match.group(2)
|
|
||||||
lora_type = match.group(3)
|
|
||||||
|
|
||||||
sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
|
|
||||||
if sub_layer_renamed is None:
|
|
||||||
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
output_string = (
|
|
||||||
f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
|
|
||||||
)
|
|
||||||
return output_string
|
|
||||||
else:
|
|
||||||
print(f"Error: unrecognized tensor {t}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
||||||
fout.write(b"ggla"[::-1]) # magic (ggml lora)
|
fout.write(b"ggla"[::-1]) # magic (ggml lora)
|
||||||
fout.write(struct.pack("i", 1)) # file version
|
fout.write(struct.pack("i", 1)) # file version
|
||||||
@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
|||||||
fout.write(struct.pack("i", int(params["lora_alpha"])))
|
fout.write(struct.pack("i", int(params["lora_alpha"])))
|
||||||
|
|
||||||
|
|
||||||
def write_tensor_header(
|
def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
|
||||||
self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
|
|
||||||
) -> None:
|
|
||||||
sname = name.encode("utf-8")
|
sname = name.encode("utf-8")
|
||||||
fout.write(
|
fout.write(
|
||||||
struct.pack(
|
struct.pack(
|
||||||
@ -78,11 +47,12 @@ def write_tensor_header(
|
|||||||
fout.seek((fout.tell() + 31) & -32)
|
fout.seek((fout.tell() + 31) & -32)
|
||||||
|
|
||||||
|
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) < 2:
|
||||||
print(f"Usage: python {sys.argv[0]} <path>")
|
print(f"Usage: python {sys.argv[0]} <path> [arch]")
|
||||||
print(
|
print(
|
||||||
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
||||||
)
|
)
|
||||||
|
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
||||||
@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
|||||||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||||
|
|
||||||
model = torch.load(input_model, map_location="cpu")
|
model = torch.load(input_model, map_location="cpu")
|
||||||
|
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
|
||||||
|
|
||||||
|
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
|
||||||
|
print(f"Error: unsupported architecture {arch_name}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
|
||||||
|
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
|
||||||
|
|
||||||
with open(input_json, "r") as f:
|
with open(input_json, "r") as f:
|
||||||
params = json.load(f)
|
params = json.load(f)
|
||||||
@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
|
|||||||
|
|
||||||
write_file_header(fout, params)
|
write_file_header(fout, params)
|
||||||
for k, v in model.items():
|
for k, v in model.items():
|
||||||
|
orig_k = k
|
||||||
if k.endswith(".default.weight"):
|
if k.endswith(".default.weight"):
|
||||||
k = k.replace(".default.weight", ".weight")
|
k = k.replace(".default.weight", ".weight")
|
||||||
if k in ["llama_proj.weight", "llama_proj.bias"]:
|
if k in ["llama_proj.weight", "llama_proj.bias"]:
|
||||||
@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
|
|||||||
v = v.float()
|
v = v.float()
|
||||||
|
|
||||||
t = v.detach().numpy()
|
t = v.detach().numpy()
|
||||||
tname = translate_tensor_name(k)
|
|
||||||
|
prefix = "base_model.model."
|
||||||
|
if k.startswith(prefix):
|
||||||
|
k = k[len(prefix) :]
|
||||||
|
|
||||||
|
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
|
||||||
|
if k.endswith(lora_suffixes):
|
||||||
|
suffix = k[-len(lora_suffixes[0]):]
|
||||||
|
k = k[: -len(lora_suffixes[0])]
|
||||||
|
else:
|
||||||
|
print(f"Error: unrecognized tensor name {orig_k}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
tname = name_map.get_name(k)
|
||||||
|
if tname is None:
|
||||||
|
print(f"Error: could not map tensor name {orig_k}")
|
||||||
|
print(" Note: the arch parameter must be specified if the model is not llama")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if suffix == ".lora_A.weight":
|
||||||
|
tname += ".weight.loraA"
|
||||||
|
elif suffix == ".lora_B.weight":
|
||||||
|
tname += ".weight.loraB"
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||||
write_tensor_header(fout, tname, t.shape, t.dtype)
|
write_tensor_header(fout, tname, t.shape, t.dtype)
|
||||||
t.tofile(fout)
|
t.tofile(fout)
|
||||||
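Note: the hunks above teach the LoRA adapter converter about non-llama architectures by taking an optional [arch] argument and routing tensor renaming through gguf.TensorNameMap. A minimal sketch of that mapping path, assuming the repository's gguf-py package is importable and using a hypothetical q_proj adapter tensor name:

import gguf

arch_name = "llama"   # default when no [arch] argument is given
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200)

k = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"   # hypothetical PEFT name
k = k[len("base_model.model."):]          # strip the PEFT prefix
suffix = ".lora_A.weight"
k = k[: -len(suffix)]                     # strip the LoRA suffix
tname = name_map.get_name(k)              # e.g. "blk.0.attn_q" for llama, None if unmapped
print(f"{tname}.weight.loraA" if tname is not None else "could not map tensor name")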
|
401
convert.py
@ -10,6 +10,7 @@ import itertools
|
|||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
import mmap
|
import mmap
|
||||||
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
import re
|
import re
|
||||||
import signal
|
import signal
|
||||||
@ -18,15 +19,15 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import zipfile
|
import zipfile
|
||||||
from abc import ABCMeta, abstractmethod
|
from abc import ABCMeta, abstractmethod
|
||||||
|
from collections import OrderedDict
|
||||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
|
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sentencepiece import SentencePieceProcessor
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
|
||||||
import os
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
@ -42,6 +43,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
|||||||
ARCH = gguf.MODEL_ARCH.LLAMA
|
ARCH = gguf.MODEL_ARCH.LLAMA
|
||||||
|
|
||||||
DEFAULT_CONCURRENCY = 8
|
DEFAULT_CONCURRENCY = 8
|
||||||
|
|
||||||
#
|
#
|
||||||
# data types
|
# data types
|
||||||
#
|
#
|
||||||
@ -62,10 +64,10 @@ class UnquantizedDataType(DataType):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
||||||
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
||||||
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
||||||
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@ -151,14 +153,16 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Params:
|
class Params:
|
||||||
n_vocab: int
|
n_vocab: int
|
||||||
n_embd: int
|
n_embd: int
|
||||||
n_layer: int
|
n_layer: int
|
||||||
n_ctx: int
|
n_ctx: int
|
||||||
n_ff: int
|
n_ff: int
|
||||||
n_head: int
|
n_head: int
|
||||||
n_head_kv: int
|
n_head_kv: int
|
||||||
f_norm_eps: float
|
n_experts: int | None = None
|
||||||
|
n_experts_used: int | None = None
|
||||||
|
f_norm_eps: float | None = None
|
||||||
|
|
||||||
rope_scaling_type: gguf.RopeScalingType | None = None
|
rope_scaling_type: gguf.RopeScalingType | None = None
|
||||||
f_rope_freq_base: float | None = None
|
f_rope_freq_base: float | None = None
|
||||||
@ -233,6 +237,13 @@ class Params:
|
|||||||
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
|
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||||
|
|
||||||
|
n_experts = None
|
||||||
|
n_experts_used = None
|
||||||
|
|
||||||
|
if "num_local_experts" in config:
|
||||||
|
n_experts = config["num_local_experts"]
|
||||||
|
n_experts_used = config["num_experts_per_tok"]
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab = config["vocab_size"],
|
n_vocab = config["vocab_size"],
|
||||||
n_embd = config["hidden_size"],
|
n_embd = config["hidden_size"],
|
||||||
@ -241,6 +252,8 @@ class Params:
|
|||||||
n_ff = config["intermediate_size"],
|
n_ff = config["intermediate_size"],
|
||||||
n_head = (n_head := config["num_attention_heads"]),
|
n_head = (n_head := config["num_attention_heads"]),
|
||||||
n_head_kv = config.get("num_key_value_heads", n_head),
|
n_head_kv = config.get("num_key_value_heads", n_head),
|
||||||
|
n_experts = n_experts,
|
||||||
|
n_experts_used = n_experts_used,
|
||||||
f_norm_eps = config["rms_norm_eps"],
|
f_norm_eps = config["rms_norm_eps"],
|
||||||
f_rope_freq_base = config.get("rope_theta"),
|
f_rope_freq_base = config.get("rope_theta"),
|
||||||
rope_scaling_type = rope_scaling_type,
|
rope_scaling_type = rope_scaling_type,
|
||||||
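Note: a self-contained sketch of the new expert-count detection for HuggingFace config.json files; the config values below are hypothetical, not read from a real checkpoint:

config = {
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-5,
    "num_local_experts": 8,        # present only on MoE checkpoints such as Mixtral
    "num_experts_per_tok": 2,
}

n_experts = None
n_experts_used = None
if "num_local_experts" in config:
    n_experts = config["num_local_experts"]
    n_experts_used = config["num_experts_per_tok"]

print(n_experts, n_experts_used)   # 8 2 here; None None for a dense checkpoint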
@ -255,8 +268,15 @@ class Params:
|
|||||||
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
||||||
config = json.load(open(config_path))
|
config = json.load(open(config_path))
|
||||||
|
|
||||||
|
n_experts = None
|
||||||
|
n_experts_used = None
|
||||||
|
f_rope_freq_base = None
|
||||||
|
|
||||||
# hack to determine LLaMA v1 vs v2 vs CodeLlama
|
# hack to determine LLaMA v1 vs v2 vs CodeLlama
|
||||||
if config.get("rope_theta") == 1000000:
|
if config.get("moe"):
|
||||||
|
# Mixtral
|
||||||
|
n_ctx = 32768
|
||||||
|
elif config.get("rope_theta") == 1000000:
|
||||||
# CodeLlama
|
# CodeLlama
|
||||||
n_ctx = 16384
|
n_ctx = 16384
|
||||||
elif config["norm_eps"] == 1e-05:
|
elif config["norm_eps"] == 1e-05:
|
||||||
@ -266,16 +286,27 @@ class Params:
|
|||||||
# LLaMA v1
|
# LLaMA v1
|
||||||
n_ctx = 2048
|
n_ctx = 2048
|
||||||
|
|
||||||
|
if "layers.0.feed_forward.w1.weight" in model:
|
||||||
|
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
|
||||||
|
|
||||||
|
if config.get("moe"):
|
||||||
|
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
|
||||||
|
n_experts = config["moe"]["num_experts"]
|
||||||
|
n_experts_used = config["moe"]["num_experts_per_tok"]
|
||||||
|
f_rope_freq_base = 1e6
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab = model["tok_embeddings.weight"].shape[0],
|
n_vocab = model["tok_embeddings.weight"].shape[0],
|
||||||
n_embd = config["dim"],
|
n_embd = config["dim"],
|
||||||
n_layer = config["n_layers"],
|
n_layer = config["n_layers"],
|
||||||
n_ctx = n_ctx,
|
n_ctx = n_ctx,
|
||||||
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
|
n_ff = n_ff,
|
||||||
n_head = (n_head := config["n_heads"]),
|
n_head = (n_head := config["n_heads"]),
|
||||||
n_head_kv = config.get("n_kv_heads", n_head),
|
n_head_kv = config.get("n_kv_heads", n_head),
|
||||||
|
n_experts = n_experts,
|
||||||
|
n_experts_used = n_experts_used,
|
||||||
f_norm_eps = config["norm_eps"],
|
f_norm_eps = config["norm_eps"],
|
||||||
f_rope_freq_base = config.get("rope_theta"),
|
f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
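Note: the corresponding branch of loadOriginalParamsJson for original params.json files, condensed into a runnable sketch with hypothetical values (the real code also derives n_ff from the first expert's w1 weight shape):

config = {
    "dim": 4096, "n_layers": 32, "n_heads": 32, "norm_eps": 1e-5,
    "moe": {"num_experts": 8, "num_experts_per_tok": 2},
}

n_experts = n_experts_used = f_rope_freq_base = None

if config.get("moe"):
    n_ctx = 32768                                  # Mixtral
    n_experts = config["moe"]["num_experts"]
    n_experts_used = config["moe"]["num_experts_per_tok"]
    f_rope_freq_base = 1e6
elif config.get("rope_theta") == 1000000:
    n_ctx = 16384                                  # CodeLlama
elif config["norm_eps"] == 1e-05:
    n_ctx = 4096                                   # LLaMA v2
else:
    n_ctx = 2048                                   # LLaMA v1

print(n_ctx, n_experts, n_experts_used, f_rope_freq_base)   # 32768 8 2 1000000.0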
@ -297,127 +328,138 @@ class Params:
|
|||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
#
|
class VocabLoader:
|
||||||
# vocab
|
def __init__(self, params: Params, fname_tokenizer: Path) -> None:
|
||||||
#
|
try:
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
except ImportError as e:
|
||||||
|
raise ImportError(
|
||||||
|
"To use VocabLoader, please install the `transformers` package. "
|
||||||
|
"You can install it with `pip install transformers`."
|
||||||
|
) from e
|
||||||
|
|
||||||
class BpeVocab:
|
try:
|
||||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
|
||||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
except ValueError:
|
||||||
added_tokens: dict[str, int]
|
self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
|
||||||
if fname_added_tokens is not None:
|
|
||||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
|
||||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
|
||||||
|
for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
|
||||||
|
if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
|
||||||
|
continue
|
||||||
|
|
||||||
|
self.added_tokens_dict[tok] = tokidx
|
||||||
|
|
||||||
|
self.unk_token_id: int = self.tokenizer.unk_token_id
|
||||||
|
self.specials: dict[str, int] = {
|
||||||
|
tok: self.tokenizer.get_vocab()[tok]
|
||||||
|
for tok in self.tokenizer.all_special_tokens
|
||||||
|
}
|
||||||
|
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
|
||||||
|
self.vocab_size_base: int = self.tokenizer.vocab_size
|
||||||
|
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
|
||||||
|
self.fname_tokenizer: Path = fname_tokenizer
|
||||||
|
|
||||||
|
vocab_file = "tokenizer.model"
|
||||||
|
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||||
|
if path_candidate is not None:
|
||||||
|
self.spm = SentencePieceProcessor(str(path_candidate))
|
||||||
|
print(self.spm.vocab_size(), self.vocab_size_base)
|
||||||
else:
|
else:
|
||||||
# Fall back to trying to find the added tokens in tokenizer.json
|
self.spm = None
|
||||||
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
|
||||||
if not tokenizer_json_file.is_file():
|
|
||||||
added_tokens = {}
|
|
||||||
else:
|
|
||||||
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
|
||||||
added_tokens = dict(
|
|
||||||
(item['content'], item['id'])
|
|
||||||
for item in tokenizer_json.get('added_tokens', [])
|
|
||||||
# Added tokens here can be duplicates of the main vocabulary.
|
|
||||||
if item['content'] not in self.bpe_tokenizer)
|
|
||||||
|
|
||||||
vocab_size: int = len(self.bpe_tokenizer)
|
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
tokenizer = self.tokenizer
|
||||||
actual_ids = sorted(added_tokens.values())
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
|
||||||
if expected_ids != actual_ids:
|
added_tokens_ids = set(self.added_tokens_dict.values())
|
||||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
|
||||||
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
|
||||||
|
|
||||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
for i in range(self.vocab_size_base):
|
||||||
self.added_tokens_list = [text for (text, idx) in items]
|
if i in added_tokens_ids:
|
||||||
self.vocab_size_base: int = vocab_size
|
continue
|
||||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
|
||||||
self.fname_tokenizer = fname_tokenizer
|
|
||||||
self.fname_added_tokens = fname_added_tokens
|
|
||||||
|
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
text = reverse_vocab[i].encode("utf-8")
|
||||||
tokenizer = self.bpe_tokenizer
|
yield text, self.get_token_score(i), self.get_token_type(i)
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
|
||||||
|
|
||||||
for i, _ in enumerate(tokenizer):
|
def get_token_type(self, token_id: int) -> gguf.TokenType:
|
||||||
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
toktype = gguf.TokenType.NORMAL
|
||||||
|
|
||||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
if self.spm is not None and token_id < self.spm.vocab_size():
|
||||||
for text in self.added_tokens_list:
|
if self.spm.is_unknown(token_id):
|
||||||
score = -1000.0
|
|
||||||
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
|
|
||||||
|
|
||||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
yield from self.bpe_tokens()
|
|
||||||
yield from self.added_tokens()
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
|
||||||
|
|
||||||
|
|
||||||
class SentencePieceVocab:
|
|
||||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
|
||||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
|
||||||
added_tokens: dict[str, int]
|
|
||||||
if fname_added_tokens is not None:
|
|
||||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
|
||||||
else:
|
|
||||||
added_tokens = {}
|
|
||||||
|
|
||||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
|
||||||
|
|
||||||
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
|
||||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
|
||||||
actual_new_ids = sorted(new_tokens.keys())
|
|
||||||
|
|
||||||
if expected_new_ids != actual_new_ids:
|
|
||||||
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
|
||||||
|
|
||||||
# Token pieces that were added to the base vocabulary.
|
|
||||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
|
||||||
self.vocab_size_base = vocab_size
|
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
|
||||||
self.fname_tokenizer = fname_tokenizer
|
|
||||||
self.fname_added_tokens = fname_added_tokens
|
|
||||||
|
|
||||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
tokenizer = self.sentencepiece_tokenizer
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
text: bytes = piece.encode("utf-8")
|
|
||||||
score: float = tokenizer.get_score(i)
|
|
||||||
|
|
||||||
toktype = gguf.TokenType.NORMAL
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
toktype = gguf.TokenType.UNKNOWN
|
toktype = gguf.TokenType.UNKNOWN
|
||||||
if tokenizer.is_control(i):
|
if self.spm.is_control(token_id):
|
||||||
|
toktype = gguf.TokenType.CONTROL
|
||||||
|
if self.spm.is_unused(token_id):
|
||||||
|
toktype = gguf.TokenType.UNUSED
|
||||||
|
if self.spm.is_byte(token_id):
|
||||||
|
toktype = gguf.TokenType.BYTE
|
||||||
|
else:
|
||||||
|
if token_id == self.unk_token_id:
|
||||||
|
toktype = gguf.TokenType.UNKNOWN
|
||||||
|
if token_id in self.special_ids:
|
||||||
toktype = gguf.TokenType.CONTROL
|
toktype = gguf.TokenType.CONTROL
|
||||||
|
|
||||||
# NOTE: I think added_tokens are user defined.
|
return toktype
|
||||||
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
|
|
||||||
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
|
|
||||||
|
|
||||||
if tokenizer.is_unused(i):
|
def get_token_score(self, token_id: int) -> float:
|
||||||
toktype = gguf.TokenType.UNUSED
|
if self.spm is not None and token_id < self.spm.vocab_size():
|
||||||
if tokenizer.is_byte(i):
|
return cast(float, self.spm.get_score(token_id))
|
||||||
toktype = gguf.TokenType.BYTE
|
return 0.0
|
||||||
|
|
||||||
yield text, score, toktype
|
|
||||||
|
|
||||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
for text in self.added_tokens_list:
|
|
||||||
score = -1000.0
|
for text in self.added_tokens_dict:
|
||||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
if text in self.specials:
|
||||||
|
|
||||||
|
toktype = self.get_token_type(self.specials[text])
|
||||||
|
score = self.get_token_score(self.specials[text])
|
||||||
|
|
||||||
|
else:
|
||||||
|
toktype = gguf.TokenType.USER_DEFINED
|
||||||
|
score = -1000.0
|
||||||
|
|
||||||
|
yield text.encode("utf-8"), score, toktype
|
||||||
|
|
||||||
|
def has_newline_token(self) -> bool:
|
||||||
|
return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
|
||||||
|
|
||||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
yield from self.sentencepiece_tokens()
|
yield from self.hf_tokens()
|
||||||
yield from self.added_tokens()
|
yield from self.added_tokens()
|
||||||
|
|
||||||
|
def get_vocab_type(self) -> str:
|
||||||
|
path_candidates = []
|
||||||
|
vocab_file = "tokenizer.model"
|
||||||
|
path_candidates.append(vocab_file)
|
||||||
|
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||||
|
if path_candidate is not None:
|
||||||
|
return "llama"
|
||||||
|
|
||||||
|
vocab_file = "vocab.json"
|
||||||
|
path_candidates.append(vocab_file)
|
||||||
|
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||||
|
if path_candidate is not None:
|
||||||
|
return "gpt2"
|
||||||
|
|
||||||
|
vocab_file = "tokenizer.json"
|
||||||
|
path_candidates.append(vocab_file)
|
||||||
|
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||||
|
if path_candidate:
|
||||||
|
if not self.has_newline_token():
|
||||||
|
return "gpt2"
|
||||||
|
return "llama"
|
||||||
|
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
|
||||||
|
"if it's in another directory, pass the directory as --vocab-dir"
|
||||||
|
)
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
Vocab: TypeAlias = 'VocabLoader'
|
||||||
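Note: the new VocabLoader replaces BpeVocab and SentencePieceVocab with a single transformers-backed loader. Its get_vocab_type decision reduces to the standalone sketch below; this is a simplification, since the real method also searches the parent directory via find_vocab_file_path:

import tempfile
from pathlib import Path

def guess_vocab_type(tokenizer_dir: Path, has_newline_token: bool) -> str:
    if (tokenizer_dir / "tokenizer.model").exists():
        return "llama"   # SentencePiece model present
    if (tokenizer_dir / "vocab.json").exists():
        return "gpt2"    # plain BPE vocab
    if (tokenizer_dir / "tokenizer.json").exists():
        # fast tokenizer only: decide by whether a newline token exists
        return "llama" if has_newline_token else "gpt2"
    raise FileNotFoundError("no tokenizer files found; pass the directory as --vocab-dir")

with tempfile.TemporaryDirectory() as d:
    (Path(d) / "tokenizer.model").touch()
    print(guess_vocab_type(Path(d), has_newline_token=True))   # llama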
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# data loading
|
# data loading
|
||||||
@ -585,7 +627,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
|
|||||||
|
|
||||||
if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
|
if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
|
||||||
# Transformers models put different tensors in different files, but
|
# Transformers models put different tensors in different files, but
|
||||||
# don't split indivdual tensors between files.
|
# don't split individual tensors between files.
|
||||||
model: LazyModel = {}
|
model: LazyModel = {}
|
||||||
for mp in models_plus:
|
for mp in models_plus:
|
||||||
model.update(mp.model)
|
model.update(mp.model)
|
||||||
@ -678,7 +720,7 @@ class LazyUnpickler(pickle.Unpickler):
|
|||||||
return func(*args)
|
return func(*args)
|
||||||
|
|
||||||
CLASSES: dict[tuple[str, str], Any] = {
|
CLASSES: dict[tuple[str, str], Any] = {
|
||||||
# getattr used here as a workaround for mypy not being smart enough to detrmine
|
# getattr used here as a workaround for mypy not being smart enough to determine
|
||||||
# the staticmethods have a __func__ attribute.
|
# the staticmethods have a __func__ attribute.
|
||||||
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
||||||
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
||||||
@ -794,20 +836,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
|||||||
yield result
|
yield result
|
||||||
|
|
||||||
|
|
||||||
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
-        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
-        if params.n_vocab == vocab.vocab_size_base:
+        if params.n_vocab == vocab.vocab_size:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_list = []
-            vocab.vocab_size = vocab.vocab_size_base
+            vocab.added_tokens_dict = OrderedDict()
+            vocab.vocab_size = vocab.vocab_size
+            return
+
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
+            vocab.vocab_size = params.n_vocab
             return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
-        if vocab.fname_added_tokens is not None:
-            msg += f" combined with {vocab.fname_added_tokens}"
         msg += f" has {vocab.vocab_size})."
-        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
+        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)


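Note: a worked example of what the new pad_vocab path produces when the model's embedding table is larger than the tokenizer; the sizes are hypothetical:

n_vocab_model = 32064   # hypothetical: rows in the model's token embedding
vocab_size    = 32000   # hypothetical: tokens provided by the tokenizer

pad_count = n_vocab_model - vocab_size
dummies = [f"<dummy{i:05}>" for i in range(1, pad_count + 1)]

print(f"Padding vocab with {pad_count} token(s) - {dummies[0]} through {dummies[-1]}")
# Padding vocab with 64 token(s) - <dummy00001> through <dummy00064>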
@ -832,7 +881,17 @@ class OutputFile:
|
|||||||
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
||||||
self.gguf.add_head_count (params.n_head)
|
self.gguf.add_head_count (params.n_head)
|
||||||
self.gguf.add_head_count_kv (params.n_head_kv)
|
self.gguf.add_head_count_kv (params.n_head_kv)
|
||||||
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
|
|
||||||
|
if params.n_experts:
|
||||||
|
self.gguf.add_expert_count(params.n_experts)
|
||||||
|
|
||||||
|
if params.n_experts_used:
|
||||||
|
self.gguf.add_expert_used_count(params.n_experts_used)
|
||||||
|
|
||||||
|
if params.f_norm_eps:
|
||||||
|
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
|
||||||
|
else:
|
||||||
|
raise ValueError('f_norm_eps is None')
|
||||||
|
|
||||||
if params.f_rope_freq_base is not None:
|
if params.f_rope_freq_base is not None:
|
||||||
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
||||||
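Note: the OutputFile hunk above writes two new metadata keys for MoE models and makes the RMS-norm epsilon mandatory. A sketch of emitting just those keys with gguf-py, assuming the gguf-py version from this tree (which provides the expert-count setters); the output file name and counts are hypothetical:

import gguf

arch = gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA]
writer = gguf.GGUFWriter("moe-meta-only.gguf", arch)   # hypothetical output file

writer.add_expert_count(8)           # from num_local_experts / moe.num_experts
writer.add_expert_used_count(2)      # from num_experts_per_tok
writer.add_layer_norm_rms_eps(1e-5)  # now mandatory: the converter raises if f_norm_eps is None

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()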
@ -861,12 +920,8 @@ class OutputFile:
|
|||||||
scores.append(score)
|
scores.append(score)
|
||||||
toktypes.append(toktype)
|
toktypes.append(toktype)
|
||||||
|
|
||||||
if isinstance(vocab, SentencePieceVocab):
|
vocab_type = vocab.get_vocab_type()
|
||||||
self.gguf.add_tokenizer_model("llama")
|
self.gguf.add_tokenizer_model(vocab_type)
|
||||||
elif isinstance(vocab, BpeVocab):
|
|
||||||
self.gguf.add_tokenizer_model("gpt2")
|
|
||||||
else:
|
|
||||||
raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
|
|
||||||
self.gguf.add_token_list(tokens)
|
self.gguf.add_token_list(tokens)
|
||||||
self.gguf.add_token_scores(scores)
|
self.gguf.add_token_scores(scores)
|
||||||
self.gguf.add_token_types(toktypes)
|
self.gguf.add_token_types(toktypes)
|
||||||
@ -892,8 +947,12 @@ class OutputFile:
|
|||||||
self.gguf.close()
|
self.gguf.close()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
def write_vocab_only(
|
||||||
check_vocab_size(params, vocab)
|
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||||
|
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||||
|
pad_vocab: bool = False,
|
||||||
|
) -> None:
|
||||||
|
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
|
||||||
|
|
||||||
of = OutputFile(fname_out, endianess=endianess)
|
of = OutputFile(fname_out, endianess=endianess)
|
||||||
|
|
||||||
@ -920,8 +979,13 @@ class OutputFile:
|
|||||||
return dt.quantize(arr)
|
return dt.quantize(arr)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
def write_all(
|
||||||
check_vocab_size(params, vocab)
|
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||||
|
concurrency: int = DEFAULT_CONCURRENCY,
|
||||||
|
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||||
|
pad_vocab: bool = False,
|
||||||
|
) -> None:
|
||||||
|
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
|
||||||
|
|
||||||
of = OutputFile(fname_out, endianess=endianess)
|
of = OutputFile(fname_out, endianess=endianess)
|
||||||
|
|
||||||
@ -956,7 +1020,7 @@ class OutputFile:
|
|||||||
|
|
||||||
|
|
||||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
|
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
|
||||||
|
|
||||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||||
return GGMLFileType.AllF32
|
return GGMLFileType.AllF32
|
||||||
@ -1079,35 +1143,17 @@ def load_some_model(path: Path) -> ModelPlus:
|
|||||||
return model_plus
|
return model_plus
|
||||||
|
|
||||||
|
|
||||||
def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
|
def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
|
||||||
# Be extra-friendly and accept either a file or a directory. Also, if it's
|
path2 = path / vocab_file
|
||||||
# a directory, it might be the model directory, and tokenizer.model might
|
# Use `.parent` instead of /.. to handle the symlink case better.
|
||||||
# be in the parent of that.
|
path3 = path.parent / vocab_file
|
||||||
if path.is_dir():
|
|
||||||
vocab_file = "tokenizer.model"
|
|
||||||
if vocabtype == 'bpe':
|
|
||||||
vocab_file = "vocab.json"
|
|
||||||
path2 = path / vocab_file
|
|
||||||
# Use `.parent` instead of /.. to handle the symlink case better.
|
|
||||||
path3 = path.parent / vocab_file
|
|
||||||
if path2.exists():
|
|
||||||
path = path2
|
|
||||||
elif path3.exists():
|
|
||||||
path = path3
|
|
||||||
else:
|
|
||||||
raise FileNotFoundError(
|
|
||||||
f"Could not find {vocab_file} in {path} or its parent; "
|
|
||||||
"if it's in another directory, pass the directory as --vocab-dir")
|
|
||||||
|
|
||||||
print(f"Loading vocab file '{path}', type '{vocabtype}'")
|
if path2.exists():
|
||||||
|
return path2
|
||||||
|
if path3.exists():
|
||||||
|
return path3
|
||||||
|
|
||||||
added_tokens_path = path.parent / "added_tokens.json"
|
return None
|
||||||
if vocabtype == "bpe":
|
|
||||||
return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
|
||||||
elif vocabtype == "spm":
|
|
||||||
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
|
||||||
|
|
||||||
|
|
||||||
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
||||||
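Note: for readability, the replacement helper from the hunk above restated as plain code; behaviour matches the added diff lines, and the example path is hypothetical:

from pathlib import Path
from typing import Optional

def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
    path2 = path / vocab_file
    # Use `.parent` instead of /.. to handle the symlink case better.
    path3 = path.parent / vocab_file
    if path2.exists():
        return path2
    if path3.exists():
        return path3
    return None

print(find_vocab_file_path(Path("models/llama-7b"), "tokenizer.model"))   # None unless the file exists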
@ -1145,11 +1191,11 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
||||||
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
||||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
|
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
||||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
|
|
||||||
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
||||||
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
|
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
|
||||||
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
||||||
|
parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||||
|
|
||||||
args = parser.parse_args(args_in)
|
args = parser.parse_args(args_in)
|
||||||
if args.dump_single:
|
if args.dump_single:
|
||||||
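Note: with --padvocab wired into the argument parser above, a conversion that needs padding can also be driven programmatically; a sketch, with a hypothetical model path:

import convert

# Equivalent to: python convert.py models/mixtral-8x7b --outtype f16 --padvocab
convert.main(["models/mixtral-8x7b", "--outtype", "f16", "--padvocab"])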
@ -1192,12 +1238,13 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
if not args.outfile:
|
if not args.outfile:
|
||||||
raise ValueError("need --outfile if using --vocab-only")
|
raise ValueError("need --outfile if using --vocab-only")
|
||||||
# FIXME: Try to respect vocab_dir somehow?
|
# FIXME: Try to respect vocab_dir somehow?
|
||||||
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
vocab = VocabLoader(params, args.vocab_dir or args.model)
|
||||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||||
load_merges = args.vocabtype == 'bpe',
|
load_merges = True,
|
||||||
n_vocab = vocab.vocab_size)
|
n_vocab = vocab.vocab_size)
|
||||||
outfile = args.outfile
|
outfile = args.outfile
|
||||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
|
||||||
|
endianess = endianess, pad_vocab = args.padvocab)
|
||||||
print(f"Wrote {outfile}")
|
print(f"Wrote {outfile}")
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -1205,12 +1252,15 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
vocab = model_plus.vocab
|
vocab = model_plus.vocab
|
||||||
else:
|
else:
|
||||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
vocab = VocabLoader(params, vocab_dir)
|
||||||
|
|
||||||
# FIXME: Try to respect vocab_dir somehow?
|
# FIXME: Try to respect vocab_dir somehow?
|
||||||
|
print(f"Vocab info: {vocab}")
|
||||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||||
load_merges = args.vocabtype == 'bpe',
|
load_merges = True,
|
||||||
n_vocab = vocab.vocab_size)
|
n_vocab = vocab.vocab_size)
|
||||||
|
|
||||||
|
print(f"Special vocab info: {special_vocab}")
|
||||||
model = model_plus.model
|
model = model_plus.model
|
||||||
model = convert_model_names(model, params)
|
model = convert_model_names(model, params)
|
||||||
ftype = pick_output_type(model, args.outtype)
|
ftype = pick_output_type(model, args.outtype)
|
||||||
@ -1220,7 +1270,8 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
params.ftype = ftype
|
params.ftype = ftype
|
||||||
print(f"Writing {outfile}, format {ftype}")
|
print(f"Writing {outfile}, format {ftype}")
|
||||||
|
|
||||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
|
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
|
||||||
|
concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
|
||||||
print(f"Wrote {outfile}")
|
print(f"Wrote {outfile}")
|
||||||
|
|
||||||
|
|
||||||
|
@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||||
assert(logits->n_dims == 2);
|
assert(ggml_is_matrix(logits));
|
||||||
assert(probs->n_dims == 2);
|
assert(ggml_is_matrix(probs));
|
||||||
assert(best_samples->n_dims == 1);
|
assert(ggml_is_vector(best_samples));
|
||||||
assert(logits->ne[1] == best_samples->ne[0]);
|
assert(logits->ne[1] == best_samples->ne[0]);
|
||||||
assert(logits->ne[0] == probs->ne[0]);
|
assert(logits->ne[0] == probs->ne[0]);
|
||||||
assert(logits->ne[1] == probs->ne[1]);
|
assert(logits->ne[1] == probs->ne[1]);
|
||||||
@ -1292,9 +1292,9 @@ static void sample_softmax_batch(
|
|||||||
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
|
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
|
||||||
struct ggml_tensor * best_samples
|
struct ggml_tensor * best_samples
|
||||||
) {
|
) {
|
||||||
GGML_ASSERT(best_samples->n_dims == 2);
|
GGML_ASSERT(ggml_is_matrix(best_samples));
|
||||||
GGML_ASSERT(logits->n_dims == 3);
|
GGML_ASSERT(ggml_is_3d(logits));
|
||||||
GGML_ASSERT(probs->n_dims == 3);
|
GGML_ASSERT(ggml_is_3d(probs));
|
||||||
int n_tokens = best_samples->ne[0];
|
int n_tokens = best_samples->ne[0];
|
||||||
int n_batch = best_samples->ne[1];
|
int n_batch = best_samples->ne[1];
|
||||||
int n_vocab = logits->ne[0];
|
int n_vocab = logits->ne[0];
|
||||||
@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void print_matrix(struct ggml_tensor * probs) {
|
static void print_matrix(struct ggml_tensor * probs) {
|
||||||
assert(probs->n_dims == 2);
|
assert(ggml_is_matrix(probs));
|
||||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||||
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
||||||
@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
|
|||||||
static void get_example_targets_batch(
|
static void get_example_targets_batch(
|
||||||
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
|
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
|
||||||
) {
|
) {
|
||||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
GGML_ASSERT(ggml_is_matrix(tokens_input));
|
||||||
GGML_ASSERT( targets->n_dims == 3);
|
GGML_ASSERT(ggml_is_3d(targets));
|
||||||
int n_tokens = tokens_input->ne[0];
|
int n_tokens = tokens_input->ne[0];
|
||||||
int n_batch = tokens_input->ne[1];
|
int n_batch = tokens_input->ne[1];
|
||||||
GGML_ASSERT(n_tokens == targets->ne[1]);
|
GGML_ASSERT(n_tokens == targets->ne[1]);
|
||||||
|
@ -129,13 +129,13 @@ int main(int argc, char ** argv) {
|
|||||||
const ggml_type qtype = GGML_TYPE_Q4_1;
|
const ggml_type qtype = GGML_TYPE_Q4_1;
|
||||||
|
|
||||||
size_t ctx_size = 0;
|
size_t ctx_size = 0;
|
||||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
|
||||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
|
||||||
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
|
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
|
||||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
ctx_size += ggml_row_size(qtype, sizex*sizey);
|
||||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
ctx_size += ggml_row_size(qtype, sizex*sizey);
|
||||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
|
||||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
|
||||||
ctx_size += 1024*1024*16;
|
ctx_size += 1024*1024*16;
|
||||||
|
|
||||||
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
|
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
|
||||||
|
@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void print_matrix(struct ggml_tensor * probs) {
|
static void print_matrix(struct ggml_tensor * probs) {
|
||||||
assert(probs->n_dims == 2);
|
assert(ggml_is_matrix(probs));
|
||||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||||
float p = get_f32_2d(probs, k, i);
|
float p = get_f32_2d(probs, k, i);
|
||||||
@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
|
|||||||
|
|
||||||
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||||
int ct;
|
int ct;
|
||||||
switch (gg_weights->n_dims){
|
switch (ggml_n_dims(gg_weights)) {
|
||||||
case 1:
|
case 1:
|
||||||
ct = 0;
|
ct = 0;
|
||||||
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
|
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
|
||||||
|
@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
|
|||||||
name = ggml_get_name(tensor);
|
name = ggml_get_name(tensor);
|
||||||
}
|
}
|
||||||
uint32_t name_len = strlen(name);
|
uint32_t name_len = strlen(name);
|
||||||
uint32_t nd = tensor->n_dims;
|
uint32_t nd = ggml_n_dims(tensor);
|
||||||
uint32_t ne[4] = { (uint32_t)tensor->ne[0],
|
uint32_t ne[4] = { (uint32_t)tensor->ne[0],
|
||||||
(uint32_t)tensor->ne[1],
|
(uint32_t)tensor->ne[1],
|
||||||
(uint32_t)tensor->ne[2],
|
(uint32_t)tensor->ne[2],
|
||||||
|
@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
|||||||
|
|
||||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||||
|
|
||||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
|
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
|
||||||
|
|
||||||
// print first 10 elements
|
// print first 10 elements
|
||||||
const float * data = (const float *) cur->data;
|
const float * data = (const float *) cur->data;
|
||||||
|
@ -53,6 +53,13 @@ static std::vector<T> split(const std::string & str, char delim) {
|
|||||||
return values;
|
return values;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename T, typename F>
|
||||||
|
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
|
||||||
|
std::vector<std::string> str_values;
|
||||||
|
std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
|
||||||
|
return str_values;
|
||||||
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
static T avg(const std::vector<T> & v) {
|
static T avg(const std::vector<T> & v) {
|
||||||
if (v.empty()) {
|
if (v.empty()) {
|
||||||
@ -126,7 +133,8 @@ struct cmd_params {
|
|||||||
std::vector<int> n_prompt;
|
std::vector<int> n_prompt;
|
||||||
std::vector<int> n_gen;
|
std::vector<int> n_gen;
|
||||||
std::vector<int> n_batch;
|
std::vector<int> n_batch;
|
||||||
std::vector<bool> f32_kv;
|
std::vector<ggml_type> type_k;
|
||||||
|
std::vector<ggml_type> type_v;
|
||||||
std::vector<int> n_threads;
|
std::vector<int> n_threads;
|
||||||
std::vector<int> n_gpu_layers;
|
std::vector<int> n_gpu_layers;
|
||||||
std::vector<int> main_gpu;
|
std::vector<int> main_gpu;
|
||||||
@ -142,7 +150,8 @@ static const cmd_params cmd_params_defaults = {
|
|||||||
/* n_prompt */ {512},
|
/* n_prompt */ {512},
|
||||||
/* n_gen */ {128},
|
/* n_gen */ {128},
|
||||||
/* n_batch */ {512},
|
/* n_batch */ {512},
|
||||||
/* f32_kv */ {false},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {get_num_physical_cores()},
|
/* n_threads */ {get_num_physical_cores()},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
@ -162,7 +171,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||||||
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||||
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
||||||
printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
|
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||||
|
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||||
@ -173,9 +183,32 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static ggml_type ggml_type_from_name(const std::string & s) {
|
||||||
|
if (s == "f16") {
|
||||||
|
return GGML_TYPE_F16;
|
||||||
|
}
|
||||||
|
if (s == "q8_0") {
|
||||||
|
return GGML_TYPE_Q8_0;
|
||||||
|
}
|
||||||
|
if (s == "q4_0") {
|
||||||
|
return GGML_TYPE_Q4_0;
|
||||||
|
}
|
||||||
|
if (s == "q4_1") {
|
||||||
|
return GGML_TYPE_Q4_1;
|
||||||
|
}
|
||||||
|
if (s == "q5_0") {
|
||||||
|
return GGML_TYPE_Q5_0;
|
||||||
|
}
|
||||||
|
if (s == "q5_1") {
|
||||||
|
return GGML_TYPE_Q5_1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return GGML_TYPE_COUNT;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static cmd_params parse_cmd_params(int argc, char ** argv) {
|
static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
cmd_params params;
|
cmd_params params;
|
||||||
std::string arg;
|
std::string arg;
|
||||||
@ -224,13 +257,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
auto p = split<int>(argv[i], split_delim);
|
auto p = split<int>(argv[i], split_delim);
|
||||||
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
||||||
} else if (arg == "--memory-f32") {
|
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
auto p = split<int>(argv[i], split_delim);
|
auto p = split<std::string>(argv[i], split_delim);
|
||||||
params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
|
std::vector<ggml_type> types;
|
||||||
|
for (const auto & t : p) {
|
||||||
|
ggml_type gt = ggml_type_from_name(t);
|
||||||
|
if (gt == GGML_TYPE_COUNT) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
types.push_back(gt);
|
||||||
|
}
|
||||||
|
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
|
||||||
|
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = split<std::string>(argv[i], split_delim);
|
||||||
|
std::vector<ggml_type> types;
|
||||||
|
for (const auto & t : p) {
|
||||||
|
ggml_type gt = ggml_type_from_name(t);
|
||||||
|
if (gt == GGML_TYPE_COUNT) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
types.push_back(gt);
|
||||||
|
}
|
||||||
|
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
|
||||||
} else if (arg == "-t" || arg == "--threads") {
|
} else if (arg == "-t" || arg == "--threads") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
@ -321,7 +379,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
|
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
|
||||||
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
|
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
|
||||||
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
|
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
|
||||||
if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; }
|
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
|
||||||
|
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
|
||||||
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
||||||
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
||||||
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
||||||
@ -336,7 +395,8 @@ struct cmd_params_instance {
|
|||||||
int n_prompt;
|
int n_prompt;
|
||||||
int n_gen;
|
int n_gen;
|
||||||
int n_batch;
|
int n_batch;
|
||||||
bool f32_kv;
|
ggml_type type_k;
|
||||||
|
ggml_type type_v;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
int main_gpu;
|
int main_gpu;
|
||||||
@ -365,7 +425,8 @@ struct cmd_params_instance {
|
|||||||
|
|
||||||
cparams.n_ctx = n_prompt + n_gen;
|
cparams.n_ctx = n_prompt + n_gen;
|
||||||
cparams.n_batch = n_batch;
|
cparams.n_batch = n_batch;
|
||||||
cparams.f16_kv = !f32_kv;
|
cparams.type_k = type_k;
|
||||||
|
cparams.type_v = type_v;
|
||||||
cparams.mul_mat_q = mul_mat_q;
|
cparams.mul_mat_q = mul_mat_q;
|
||||||
|
|
||||||
return cparams;
|
return cparams;
|
||||||
@ -380,7 +441,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
|||||||
for (const auto & mg : params.main_gpu)
|
for (const auto & mg : params.main_gpu)
|
||||||
for (const auto & ts : params.tensor_split)
|
for (const auto & ts : params.tensor_split)
|
||||||
for (const auto & nb : params.n_batch)
|
for (const auto & nb : params.n_batch)
|
||||||
for (const auto & fk : params.f32_kv)
|
for (const auto & tk : params.type_k)
|
||||||
|
for (const auto & tv : params.type_v)
|
||||||
for (const auto & mmq : params.mul_mat_q)
|
for (const auto & mmq : params.mul_mat_q)
|
||||||
for (const auto & nt : params.n_threads) {
|
for (const auto & nt : params.n_threads) {
|
||||||
cmd_params_instance instance = {
|
cmd_params_instance instance = {
|
||||||
@ -388,7 +450,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
|||||||
/* .n_prompt = */ n_prompt,
|
/* .n_prompt = */ n_prompt,
|
||||||
/* .n_gen = */ n_gen,
|
/* .n_gen = */ n_gen,
|
||||||
/* .n_batch = */ nb,
|
/* .n_batch = */ nb,
|
||||||
/* .f32_kv = */ fk,
|
/* .type_k = */ tk,
|
||||||
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
@ -410,7 +473,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
for (const auto & mg : params.main_gpu)
|
for (const auto & mg : params.main_gpu)
|
||||||
for (const auto & ts : params.tensor_split)
|
for (const auto & ts : params.tensor_split)
|
||||||
for (const auto & nb : params.n_batch)
|
for (const auto & nb : params.n_batch)
|
||||||
for (const auto & fk : params.f32_kv)
|
for (const auto & tk : params.type_k)
|
||||||
|
for (const auto & tv : params.type_v)
|
||||||
for (const auto & mmq : params.mul_mat_q)
|
for (const auto & mmq : params.mul_mat_q)
|
||||||
for (const auto & nt : params.n_threads) {
|
for (const auto & nt : params.n_threads) {
|
||||||
for (const auto & n_prompt : params.n_prompt) {
|
for (const auto & n_prompt : params.n_prompt) {
|
||||||
@ -422,7 +486,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .n_prompt = */ n_prompt,
|
/* .n_prompt = */ n_prompt,
|
||||||
/* .n_gen = */ 0,
|
/* .n_gen = */ 0,
|
||||||
/* .n_batch = */ nb,
|
/* .n_batch = */ nb,
|
||||||
/* .f32_kv = */ fk,
|
/* .type_k = */ tk,
|
||||||
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
@ -441,7 +506,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .n_prompt = */ 0,
|
/* .n_prompt = */ 0,
|
||||||
/* .n_gen = */ n_gen,
|
/* .n_gen = */ n_gen,
|
||||||
/* .n_batch = */ nb,
|
/* .n_batch = */ nb,
|
||||||
/* .f32_kv = */ fk,
|
/* .type_k = */ tk,
|
||||||
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
     /* .main_gpu    = */ mg,
@@ -489,7 +555,8 @@ struct test {
     uint64_t model_n_params;
     int n_batch;
     int n_threads;
-    bool f32_kv;
+    ggml_type type_k;
+    ggml_type type_v;
     int n_gpu_layers;
     int main_gpu;
     bool mul_mat_q;
@@ -508,7 +575,8 @@ struct test {
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
         n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        type_k = inst.type_k;
+        type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
         mul_mat_q = inst.mul_mat_q;
@@ -571,7 +639,7 @@ struct test {
             "cuda", "opencl", "metal", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "f16_kv",
+            "n_batch", "n_threads", "type_k", "type_v",
             "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
@@ -621,7 +689,7 @@ struct test {
             std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -805,8 +873,11 @@ struct markdown_printer : public printer {
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.push_back("n_batch");
         }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
+            fields.push_back("type_k");
+        }
+        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
+            fields.push_back("type_v");
         }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
             fields.push_back("main_gpu");
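Editor's note (not part of the commit): the hunks above replace the single f32_kv boolean with separate K and V cache tensor types. A minimal sketch of how a caller could use typed KV-cache parameters, assuming llama_context_params exposes type_k/type_v as this commit's API suggests (the chosen types are arbitrary):

    // sketch only: pick the KV cache element types before creating a context
    llama_context_params cparams = llama_context_default_params();
    cparams.type_k = GGML_TYPE_Q8_0; // assumption: a quantized K cache type is accepted here
    cparams.type_v = GGML_TYPE_F16;  // keep the V cache in f16
    llama_context * ctx = llama_new_context_with_model(model, cparams);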
@@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             ctx_size += padded_size;
             if (verbosity >= 3) {
                 printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
-                       cur->n_dims, cur->name, tensor_size, padded_size, offset);
+                       ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
             }
         }
     }
@@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
         temp->ny = longer_side;
         temp->size = 3 * longer_side * longer_side;
         temp->data = new uint8_t[temp->size]();
-        uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
+        uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA

         // fill with background color
         for (size_t i = 0; i < temp->size; i++) {
@@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
        }

        // quantize only 2D tensors
-       quantize &= (cur->n_dims == 2);
+       quantize &= (ggml_n_dims(cur) == 2);

        if (quantize) {
            new_type = type;
@@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
+        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

@@ -51,7 +51,7 @@ def bytes_to_unicode():
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
@@ -1,6 +1,6 @@
 # llama.cpp/examples/lookahead

-Demonstartion of lookahead decoding technique:
+Demonstration of lookahead decoding technique:

 https://lmsys.org/blog/2023-11-21-lookahead-decoding/

@@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
    auto cparams = llama_context_default_params();
    cparams.n_ctx = 256;
    cparams.seed = 1;
-    cparams.f16_kv = false;

    ctx = llama_new_context_with_model(model, cparams);

@@ -222,7 +222,7 @@ node index.js

 `content`: Set the text to process.

-**POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
+- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.

 *Options:*

@@ -11227,7 +11227,7 @@ class binary_reader
                }
                if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
                {
-                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimentional vector is not allowed", "size"), nullptr));
+                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr));
                }
                std::vector<size_t> dim;
                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
@@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) {
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
-      'Accept': 'text/event-stream'
+      'Accept': 'text/event-stream',
+      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });
@@ -114,7 +115,7 @@ export async function* llama(prompt, params = {}, config = {}) {
  return content;
 }

-// Call llama, return an event target that you can subcribe to
+// Call llama, return an event target that you can subscribe to
 //
 // Example:
 //
@@ -223,7 +223,7 @@
      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
      repeat_penalty: 1.18, // 1.0 = disabled
      top_k: 40, // <= 0 to use vocab size
-      top_p: 0.5, // 1.0 = disabled
+      top_p: 0.95, // 1.0 = disabled
      min_p: 0.05, // 0 = disabled
      tfs_z: 1.0, // 1.0 = disabled
      typical_p: 1.0, // 1.0 = disabled
@@ -235,10 +235,11 @@
      grammar: '',
      n_probs: 0, // no completion_probabilities,
      image_data: [],
-      cache_prompt: true
+      cache_prompt: true,
+      api_key: ''
    })

-    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
+    /* START: Support for storing prompt templates and parameters in browsers LocalStorage */

    const local_storage_storageKey = "llamacpp_server_local_storage";

@@ -282,7 +283,7 @@
      let importedTemplates = local_storage_getDataAsObject('user_templates')

      if (importedTemplates) {
-        // saved templates were successfuly imported.
+        // saved templates were successfully imported.

        console.log('Processing saved templates and updating default template')
        params.value = { ...params.value, image_data: [] };
@@ -303,7 +304,7 @@
    }

    function userTemplateResetToDefault() {
-      console.log('Reseting themplate to default')
+      console.log('Resetting template to default')
      selectedUserTemplate.value.name = 'default';
      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
    }
@@ -762,7 +763,7 @@

          <fieldset class="two">
            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
-            ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
@@ -790,6 +791,10 @@
          <fieldset>
            ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
          </fieldset>
+          <fieldset>
+            <label for="api_key">API Key</label>
+            <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+          </fieldset>
        </details>
      </form>
    `
@@ -36,6 +36,7 @@ using json = nlohmann::json;
 struct server_params
 {
     std::string hostname = "127.0.0.1";
+    std::string api_key;
     std::string public_path = "examples/server/public";
     int32_t port = 8080;
     int32_t read_timeout = 600;
@@ -376,7 +377,6 @@ struct llama_client_slot

     int32_t num_prompt_tokens = 0;
     int32_t num_prompt_tokens_processed = 0;
-    int32_t multibyte_pending = 0;

     json prompt;
     std::string generated_text;
@@ -425,7 +425,6 @@ struct llama_client_slot
         stopped_word = false;
         stopped_limit = false;
         stopping_word = "";
-        multibyte_pending = 0;
         n_past = 0;
         sent_count = 0;
         sent_token_probs_index = 0;
@@ -992,35 +991,36 @@ struct llama_server_context
         slot.generated_text += token_str;
         slot.has_next_token = true;

-        if (slot.multibyte_pending > 0)
+        // check if there is incomplete UTF-8 character at the end
+        bool incomplete = false;
+        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
         {
-            slot.multibyte_pending -= token_str.size();
-        }
-        else if (token_str.size() == 1)
-        {
-            const char c = token_str[0];
-            // 2-byte characters: 110xxxxx 10xxxxxx
+            unsigned char c = slot.generated_text[slot.generated_text.size() - i];
+            if ((c & 0xC0) == 0x80)
+            {
+                // continuation byte: 10xxxxxx
+                continue;
+            }
             if ((c & 0xE0) == 0xC0)
             {
-                slot.multibyte_pending = 1;
-                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+                // 2-byte character: 110xxxxx ...
+                incomplete = i < 2;
             }
             else if ((c & 0xF0) == 0xE0)
             {
-                slot.multibyte_pending = 2;
-                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                // 3-byte character: 1110xxxx ...
+                incomplete = i < 3;
             }
             else if ((c & 0xF8) == 0xF0)
             {
-                slot.multibyte_pending = 3;
-            }
-            else
-            {
-                slot.multibyte_pending = 0;
+                // 4-byte character: 11110xxx ...
+                incomplete = i < 4;
             }
+            // else 1-byte character or invalid byte
+            break;
         }

-        if (slot.multibyte_pending == 0)
+        if (!incomplete)
         {
             size_t pos = std::min(slot.sent_count, slot.generated_text.size());
             const std::string str_test = slot.generated_text.substr(pos);
@@ -1055,7 +1055,7 @@ struct llama_server_context
             }
         }

-        if (slot.multibyte_pending > 0 && !slot.has_next_token)
+        if (incomplete)
         {
             slot.has_next_token = true;
         }
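Editor's note (not part of the commit): the new check above replaces the stateful multibyte_pending counter with a direct scan of the buffer's tail. The same idea, written out as a self-contained illustration: look at most four bytes back, find the first non-continuation byte, and decide from its lead-byte pattern whether the sequence is still missing bytes.

    #include <string>

    // returns true if the buffer ends in the middle of a multi-byte UTF-8 sequence
    static bool ends_with_incomplete_utf8(const std::string & s) {
        for (size_t i = 1; i <= 4 && i <= s.size(); ++i) {
            unsigned char c = s[s.size() - i];
            if ((c & 0xC0) == 0x80) {
                continue;                         // continuation byte 10xxxxxx, keep scanning left
            }
            if ((c & 0xE0) == 0xC0) return i < 2; // lead byte of a 2-byte sequence
            if ((c & 0xF0) == 0xE0) return i < 3; // lead byte of a 3-byte sequence
            if ((c & 0xF8) == 0xF0) return i < 4; // lead byte of a 4-byte sequence
            return false;                         // ASCII byte or invalid lead byte
        }
        return false;
    }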
@@ -1954,6 +1954,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --host                ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
     printf("  --port PORT           port to listen (default (default: %d)\n", sparams.port);
     printf("  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    printf("  --api-key API_KEY     optional api key to enhance server security. If set, requests must include this key for access.\n");
     printf("  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
     printf("  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
     printf("  -np N, --parallel N   number of slots for process requests (default: %d)\n", params.n_parallel);
@@ -2003,6 +2004,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         sparams.public_path = argv[i];
     }
+    else if (arg == "--api-key")
+    {
+        if (++i >= argc)
+        {
+            invalid_param = true;
+            break;
+        }
+        sparams.api_key = argv[i];
+    }
     else if (arg == "--timeout" || arg == "-to")
     {
         if (++i >= argc)
@@ -2108,10 +2118,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         params.yarn_beta_slow = std::stof(argv[i]);
     }
-    else if (arg == "--memory-f32" || arg == "--memory_f32")
-    {
-        params.memory_f16 = false;
-    }
     else if (arg == "--threads" || arg == "-t")
     {
         if (++i >= argc)
@@ -2386,7 +2392,9 @@ json oaicompat_completion_params_parse(
     llama_params["__oaicompat"] = true;

     // Map OpenAI parameters to llama.cpp parameters
+    llama_params["model"]        = json_value(body, "model", std::string("uknown"));
     llama_params["prompt"]       = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
     llama_params["temperature"]  = json_value(body, "temperature", 0.8);
     llama_params["top_k"]        = json_value(body, "top_k", 40);
     llama_params["top_p"]        = json_value(body, "top_p", 0.95);
@@ -2672,6 +2680,32 @@ int main(int argc, char **argv)

     httplib::Server svr;

+    // Middleware for API key validation
+    auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
+        // If API key is not set, skip validation
+        if (sparams.api_key.empty()) {
+            return true;
+        }
+
+        // Check for API key in the header
+        auto auth_header = req.get_header_value("Authorization");
+        std::string prefix = "Bearer ";
+        if (auth_header.substr(0, prefix.size()) == prefix) {
+            std::string received_api_key = auth_header.substr(prefix.size());
+            if (received_api_key == sparams.api_key) {
+                return true; // API key is valid
+            }
+        }
+
+        // API key is invalid or not provided
+        res.set_content("Unauthorized: Invalid API Key", "text/plain");
+        res.status = 401; // Unauthorized
+
+        LOG_WARNING("Unauthorized: Invalid API Key", {});
+
+        return false;
+    };
+
     svr.set_default_headers({{"Server", "llama.cpp"},
                              {"Access-Control-Allow-Origin", "*"},
                              {"Access-Control-Allow-Headers", "content-type"}});
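Editor's note (not part of the commit): with the middleware above, clients must send the key as a bearer token whenever --api-key is set. A minimal client-side illustration using cpp-httplib; the endpoint path and JSON fields come from this diff, the key value and prompt are placeholders:

    #include "httplib.h"

    int main() {
        httplib::Client cli("127.0.0.1", 8080);
        httplib::Headers headers = {
            {"Authorization", "Bearer YOUR_API_KEY"}   // must match the server's --api-key value
        };
        // request a short completion from the /completion endpoint
        auto res = cli.Post("/completion", headers,
                            R"({"prompt": "Hello", "n_predict": 8})", "application/json");
        return (res && res->status == 200) ? 0 : 1;
    }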
@@ -2714,8 +2748,11 @@ int main(int argc, char **argv)
         res.set_content(data.dump(), "application/json");
     });

-    svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
     {
+        if (!validate_api_key(req, res)) {
+            return;
+        }
         json data = json::parse(req.body);
         const int task_id = llama.request_completion(data, false, false, -1);
         if (!json_value(data, "stream", false)) {
@@ -2802,8 +2839,11 @@ int main(int argc, char **argv)
     });

     // TODO: add mount point without "/v1" prefix -- how?
-    svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
     {
+        if (!validate_api_key(req, res)) {
+            return;
+        }
         json data = oaicompat_completion_params_parse(json::parse(req.body));

         const int task_id = llama.request_completion(data, false, false, -1);
@@ -2872,8 +2912,11 @@ int main(int argc, char **argv)
         }
     });

-    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
     {
+        if (!validate_api_key(req, res)) {
+            return;
+        }
         json data = json::parse(req.body);
         const int task_id = llama.request_completion(data, true, false, -1);
         if (!json_value(data, "stream", false)) {
@@ -3008,11 +3051,15 @@ int main(int argc, char **argv)

     svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
     {
+        if (res.status == 401)
+        {
+            res.set_content("Unauthorized", "text/plain");
+        }
         if (res.status == 400)
         {
             res.set_content("Invalid request", "text/plain");
         }
-        else if (res.status != 500)
+        else if (res.status == 404)
         {
             res.set_content("File Not Found", "text/plain");
             res.status = 404;
@@ -3035,11 +3082,15 @@ int main(int argc, char **argv)
     // to make it ctrl+clickable:
     LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);

-    LOG_INFO("HTTP server listening", {
-        {"hostname", sparams.hostname},
-        {"port", sparams.port},
-    });
+    std::unordered_map<std::string, std::string> log_data;
+    log_data["hostname"] = sparams.hostname;
+    log_data["port"] = std::to_string(sparams.port);

+    if (!sparams.api_key.empty()) {
+        log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
+    }
+
+    LOG_INFO("HTTP server listening", log_data);
     // run the HTTP server in a thread - see comment below
     std::thread t([&]()
     {
@@ -1,6 +1,6 @@
 # llama.cpp/examples/speculative

-Demonstartion of speculative decoding and tree-based speculative decoding techniques
+Demonstration of speculative decoding and tree-based speculative decoding techniques

 More info:

@@ -203,8 +203,9 @@ int main(int argc, char ** argv) {

            const std::string token_str = llama_token_to_piece(ctx_tgt, id);

-            printf("%s", token_str.c_str());
-            fflush(stdout);
+            if (!params.use_color) {
+                printf("%s", token_str.c_str());
+            }

            if (id == llama_token_eos(model_tgt)) {
                has_eos = true;
@@ -236,10 +237,18 @@ int main(int argc, char ** argv) {
                    ++n_past_tgt;
                    ++n_past_dft;
                    ++i_dft;
+                    if (params.use_color) {
+                        // Color token according to its origin sequence
+                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                        fflush(stdout);
+                    }
                    continue;
                }
            }
+            if (params.use_color) {
+                printf("%s", token_str.c_str());
+            }
+            fflush(stdout);

            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

@@ -419,7 +428,7 @@ int main(int argc, char ** argv) {
        ++n_past_tgt;
    }

-    // the first token is always proposed by the traget model before the speculation loop so we erase it here
+    // the first token is always proposed by the target model before the speculation loop so we erase it here
    for (int s = 0; s < n_seq_dft; ++s) {
        if (!drafts[s].active) {
            continue;
@@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
        opt_cb_data.last_save_iter = opt->iter;
    }

-    if (alloc) {
-        ggml_allocr_free(alloc);
-    }
-
    ggml_free(opt->ctx);
    free_train_state(train);
    ggml_free(model.ctx);
ggml-alloc.c (49 changed lines)
@@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);

-    if (!alloc->measure) {
-        ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
-    }
-
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
 #endif
@@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
 }

 ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);

     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

@@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
 static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
     ggml_tallocr_t alloc = node_tallocr(galloc, view);

-    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
     GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
     if (update_backend) {
         view->backend = view->view_src->backend;
@@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd

     // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
     // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

     if (!alloc->measure) {
         ggml_backend_buffer_init_tensor(alloc->buffer, view);
@@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
 size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
     return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 }
+
+// utils
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+
+    size_t nbytes = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL && t->view_src == NULL) {
+            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+    }
+
+    if (nbytes == 0) {
+        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                ggml_tallocr_alloc(tallocr, t);
+            } else {
+                ggml_backend_view_init(buffer, t);
+            }
+        }
+    }
+
+    ggml_tallocr_free(tallocr);
+
+    return buffer;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
+    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
+}
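Editor's note (not part of the commit): the new ggml_backend_alloc_ctx_tensors helpers place every tensor of a no_alloc context in a single backend buffer. A rough usage sketch under that assumption; the tensor shapes and the choice of the CPU backend are arbitrary:

    // sketch: create tensors without data, then let the helper allocate them in one buffer
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); // a and b now have data pointers

    // ... use the tensors, then release everything
    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);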
@@ -8,6 +8,7 @@ extern "C" {

 struct ggml_backend;
 struct ggml_backend_buffer;
+struct ggml_backend_buffer_type;

 //
 // Legacy API
@@ -42,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
 // ggml-backend v2 API
 //

-// Seperate tensor and graph allocator objects
+// Separate tensor and graph allocator objects
 // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
 // The original API is kept as a wrapper around the new API

@@ -80,6 +81,12 @@ GGML_API void ggml_gallocr_alloc_graph_n(
                    struct ggml_hash_set hash_set,
                    ggml_tallocr_t * hash_node_talloc);

+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+
 #ifdef __cplusplus
 }
 #endif
@@ -12,31 +12,50 @@ extern "C" {
    // Backend buffer
    //

+    // buffer type
+    typedef void * ggml_backend_buffer_type_context_t;
+
+    struct ggml_backend_buffer_type_i {
+        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+    };
+
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;
+        ggml_backend_buffer_type_context_t context;
+    };
+
+    // buffer
    typedef void * ggml_backend_buffer_context_t;

    struct ggml_backend_buffer_i {
-        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+        void   (*free_buffer)(ggml_backend_buffer_t buffer);
+        //void (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        void * (*get_base)   (ggml_backend_buffer_t buffer);
+        void   (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void   (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
+        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
    };

    struct ggml_backend_buffer {
        struct ggml_backend_buffer_i  iface;
-        ggml_backend_t                backend;
+        ggml_backend_buffer_type_t    buft;
        ggml_backend_buffer_context_t context;

        size_t size;
    };

-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-        struct ggml_backend           * backend,
+    ggml_backend_buffer_t ggml_backend_buffer_init(
+        ggml_backend_buffer_type_t      buft,
        struct ggml_backend_buffer_i    iface,
        ggml_backend_buffer_context_t   context,
        size_t                          size);

    //
    // Backend
    //
@@ -49,20 +68,17 @@ extern "C" {
        void (*free)(ggml_backend_t backend);

        // buffer allocation
-        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);

-        // get buffer alignment
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+        // (optional) asynchroneous tensor data access
        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        void (*synchronize)     (ggml_backend_t backend);

-        // (optional) copy tensor between different backends, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        // (optional) asynchroneous tensor copy
+        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

+        void (*synchronize)(ggml_backend_t backend);
+
        // compute graph with a plan
        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -82,6 +98,15 @@ extern "C" {
        ggml_backend_context_t context;
    };

+
+    //
+    // Backend registry
+    //
+
+    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+
+    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif
ggml-backend.c (771 changed lines — diff not shown because it is too large)
@@ -7,41 +7,44 @@
 extern "C" {
 #endif

+    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+
    //
    // Backend buffer
    //

-    struct ggml_backend_buffer;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    // buffer type
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);

-    // backend buffer functions
+    // buffer
    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

    //
    // Backend
    //

-    struct ggml_backend;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
    GGML_API void         ggml_backend_free(ggml_backend_t backend);

-    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);

-    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

-    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
@@ -57,6 +60,7 @@ extern "C" {

    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy

    //
    // CPU backend
@@ -68,8 +72,23 @@ extern "C" {
    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

    // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+    //
+    // Backend registry
+    //
+
+    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+
+    GGML_API size_t                     ggml_backend_reg_get_count(void);
+    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
+    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
+    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);

    //
    // Backend scheduler
@@ -131,6 +150,32 @@ extern "C" {
            ggml_backend_sched_t sched,
            struct ggml_cgraph * graph);

+
+    //
+    // Utils
+    //
+
+    struct ggml_backend_graph_copy {
+        ggml_backend_buffer_t buffer;
+        struct ggml_context * ctx_allocated;
+        struct ggml_context * ctx_unallocated;
+        struct ggml_cgraph * graph;
+    };
+
+    // Copy a graph to a different backend
+    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+    // Compare the output of two backends
+    GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+    // Tensor initialization
+    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+
 #ifdef __cplusplus
 }
 #endif
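Editor's note (not part of the commit): a small sketch of how the new backend registry declared above could be used to pick a backend by name at runtime. Error handling is minimal, and the "not found" sentinel is an assumption, not something stated in the header:

    #include <cstdint>
    #include <cstdio>
    #include "ggml-backend.h"

    ggml_backend_t init_backend_by_name(const char * name) {
        // list everything that registered itself
        for (size_t i = 0; i < ggml_backend_reg_get_count(); ++i) {
            printf("backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
        }
        size_t idx = ggml_backend_reg_find_by_name(name); // e.g. "CPU"; names are backend-defined
        if (idx == SIZE_MAX) {                            // assumption: an out-of-range value signals "not found"
            return NULL;
        }
        return ggml_backend_reg_init_backend(idx, /*params =*/ NULL);
    }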
ggml-cuda.cu (2217 changed lines — diff not shown because it is too large)
ggml-cuda.h (10 changed lines)
@@ -49,7 +49,15 @@ GGML_API int ggml_cuda_get_device_count(void);
 GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
+GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API int  ggml_backend_cuda_get_device(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

 #ifdef __cplusplus
 }
@@ -232,7 +232,7 @@ bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml
 // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
 size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

-// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);

 // return index, asserts if table is full
@@ -99,6 +99,12 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+
+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

 #ifdef __cplusplus
 }
ggml-metal.m (1209 changed lines — diff not shown because it is too large)
ggml-metal.metal (2461 changed lines — diff not shown because it is too large)
@@ -3114,7 +3114,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri

    size_t vl = __riscv_vsetvl_e8m1(qk/2);

-    // These tempory registers are for masking and shift operations
+    // These temporary registers are for masking and shift operations
    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);

@@ -4757,7 +4757,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

    vl = 16;

-    // retreive lane to multiply with scale
+    // retrieve lane to multiply with scale
    vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
    vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
    vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
ggml.h (96 changed lines)
@@ -215,9 +215,9 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

 #define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         1024
+#define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            6
+#define GGML_MAX_SRC            10
 #define GGML_MAX_NAME           64
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@@ -283,6 +283,20 @@
    const type prefix##3 = (pointer)->array[3]; \
    GGML_UNUSED(prefix##3);

+#define GGML_TENSOR_UNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#define GGML_TENSOR_BINARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
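Editor's note (not part of the commit): these macros expand to the usual per-dimension locals (ne00..ne03 and nb00..nb03 for src0, ne10../nb10.. for src1, ne0../nb0.. for dst), so op implementations do not have to spell them out. A rough sketch of how an op body might use the binary variant; the function name is made up for illustration and it assumes matching shapes with contiguous rows:

    // hypothetical op skeleton using the expanded locals
    static void ggml_compute_forward_binary_example(
            const struct ggml_tensor * src0,
            const struct ggml_tensor * src1,
            struct ggml_tensor * dst) {
        GGML_TENSOR_BINARY_OP_LOCALS

        for (int64_t i1 = 0; i1 < ne1; ++i1) {
            float * dst_row  = (float *)((char *) dst->data  + i1*nb1);
            float * src0_row = (float *)((char *) src0->data + i1*nb01);
            float * src1_row = (float *)((char *) src1->data + i1*nb11);
            for (int64_t i0 = 0; i0 < ne0; ++i0) {
                dst_row[i0] = src0_row[i0] + src1_row[i0]; // elementwise add as a placeholder
            }
        }
    }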
@@ -381,6 +395,7 @@ extern "C" {
 GGML_OP_GROUP_NORM,

 GGML_OP_MUL_MAT,
+GGML_OP_MUL_MAT_ID,
 GGML_OP_OUT_PROD,

 GGML_OP_SCALE,

@@ -407,8 +422,10 @@ extern "C" {
 GGML_OP_CONV_TRANSPOSE_2D,
 GGML_OP_POOL_1D,
 GGML_OP_POOL_2D,

 GGML_OP_UPSCALE, // nearest interpolate
+GGML_OP_PAD,
+GGML_OP_ARGSORT,
+GGML_OP_LEAKY_RELU,

 GGML_OP_FLASH_ATTN,
 GGML_OP_FLASH_FF,

@@ -448,7 +465,8 @@ extern "C" {
 GGML_UNARY_OP_GELU,
 GGML_UNARY_OP_GELU_QUICK,
 GGML_UNARY_OP_SILU,
-GGML_UNARY_OP_LEAKY
+
+GGML_UNARY_OP_COUNT,
 };

 enum ggml_object_type {

@@ -484,7 +502,6 @@ extern "C" {

 struct ggml_backend_buffer * buffer;

-int n_dims;
 int64_t ne[GGML_MAX_DIMS]; // number of elements
 size_t nb[GGML_MAX_DIMS]; // stride in bytes:
 // nb[0] = ggml_type_size(type)

@@ -516,7 +533,7 @@ extern "C" {

 void * extra; // extra things e.g. for ggml-cuda.cu

-char padding[12];
+char padding[8];
 };

 static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -621,16 +638,22 @@ extern "C" {
 GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
 GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
 GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

-GGML_API int ggml_blck_size (enum ggml_type type);
-GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+GGML_API int ggml_blck_size(enum ggml_type type);
+GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+GGML_DEPRECATED(
+GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+"use ggml_row_size() instead");

 GGML_API const char * ggml_type_name(enum ggml_type type);
 GGML_API const char * ggml_op_name (enum ggml_op op);
 GGML_API const char * ggml_op_symbol(enum ggml_op op);

+GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+
 GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

 GGML_API bool ggml_is_quantized(enum ggml_type type);
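ggml_type_sizef() is deprecated in favour of the new ggml_row_size(). A small sketch of the migration; it mirrors the replacement made in tests/test-quantize-perf.cpp further down, with a hypothetical helper name:

    #include "ggml.h"

    // old: size_t row_bytes = n / ggml_blck_size(type) * ggml_type_size(type);
    // new: one call that returns the byte count for n elements of the given type
    static size_t row_bytes(enum ggml_type type, int64_t n) {
        return ggml_row_size(type, n);
    }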
@@ -641,6 +664,11 @@ extern "C" {
 GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
 GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
 GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars

 GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

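Since the n_dims field is removed from ggml_tensor (see the struct hunk above), callers now go through the new predicates. A tiny sketch of the replacement pattern; the caller code is hypothetical:

    #include <stdbool.h>
    #include "ggml.h"

    // before: bool is_2d = tensor->n_dims == 2;
    static bool tensor_is_2d(const struct ggml_tensor * t) {
        return ggml_n_dims(t) == 2; // see also ggml_is_matrix()/ggml_is_vector()/ggml_is_scalar()
    }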
@@ -773,6 +801,9 @@ extern "C" {
 struct ggml_tensor * a,
 struct ggml_tensor * b);

+// dst = a
+// view(dst, nb1, nb2, nb3, offset) += b
+// return dst
 GGML_API struct ggml_tensor * ggml_acc(
 struct ggml_context * ctx,
 struct ggml_tensor * a,

@@ -937,15 +968,14 @@ extern "C" {
 struct ggml_context * ctx,
 struct ggml_tensor * a);

-GGML_API struct ggml_tensor * ggml_leaky(
+GGML_API struct ggml_tensor * ggml_leaky_relu(
 struct ggml_context * ctx,
-struct ggml_tensor * a);
+struct ggml_tensor * a, float negative_slope, bool inplace);

 GGML_API struct ggml_tensor * ggml_relu_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a);

-// TODO: double-check this computation is correct
 GGML_API struct ggml_tensor * ggml_gelu(
 struct ggml_context * ctx,
 struct ggml_tensor * a);

@@ -1027,6 +1057,16 @@ extern "C" {
 struct ggml_tensor * a,
 struct ggml_tensor * b);

+// indirect matrix multiplication
+// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
+GGML_API struct ggml_tensor * ggml_mul_mat_id(
+struct ggml_context * ctx,
+struct ggml_tensor * const as[],
+int n_as,
+struct ggml_tensor * ids,
+int id,
+struct ggml_tensor * b);
+
 // A: m columns, n rows,
 // B: p columns, n rows,
 // result is m columns, p rows
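ggml_mul_mat_id() is the indirect matrix multiplication used by the mixture-of-experts (Mixtral) graphs: it selects one of n_as expert matrices according to the ids tensor and multiplies it with b. A minimal call-site sketch; the expert count, tensor names and shapes are illustrative only:

    #include "ggml.h"

    // pick between two expert weight matrices; per the header comment above,
    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    static struct ggml_tensor * expert_matmul(struct ggml_context * ctx,
                                              struct ggml_tensor * w_expert0,
                                              struct ggml_tensor * w_expert1,
                                              struct ggml_tensor * ids, // I32 tensor of selected expert indices
                                              struct ggml_tensor * cur) {
        struct ggml_tensor * const as[2] = { w_expert0, w_expert1 };
        return ggml_mul_mat_id(ctx, as, 2, ids, 0, cur);
    }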
@@ -1234,6 +1274,7 @@ extern "C" {
 struct ggml_context * ctx,
 struct ggml_tensor * a);

+// supports 3D: a->ne[2] == b->ne[1]
 GGML_API struct ggml_tensor * ggml_get_rows(
 struct ggml_context * ctx,
 struct ggml_tensor * a,

@@ -1520,6 +1561,32 @@ extern "C" {
 struct ggml_tensor * a,
 int scale_factor);

+// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+GGML_API struct ggml_tensor * ggml_pad(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int p0,
+int p1,
+int p2,
+int p3);
+
+// sort rows
+enum ggml_sort_order {
+GGML_SORT_ASC,
+GGML_SORT_DESC,
+};
+
+GGML_API struct ggml_tensor * ggml_argsort(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+enum ggml_sort_order order);
+
+// top k elements per row
+GGML_API struct ggml_tensor * ggml_top_k(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int k);
+
 GGML_API struct ggml_tensor * ggml_flash_attn(
 struct ggml_context * ctx,
 struct ggml_tensor * q,
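A short sketch of the three new graph ops declared above (ggml_pad, ggml_argsort, ggml_top_k); the context, input tensor and constants are illustrative only:

    #include "ggml.h"

    static void build_new_ops(struct ggml_context * ctx, struct ggml_tensor * x) {
        // append two zero elements along dim 0, leave the other dims unchanged
        struct ggml_tensor * padded = ggml_pad(ctx, x, 2, 0, 0, 0);

        // per-row descending sort order, and top-3 selection per row
        struct ggml_tensor * order = ggml_argsort(ctx, x, GGML_SORT_DESC);
        struct ggml_tensor * top3  = ggml_top_k(ctx, x, 3);

        (void) padded; (void) order; (void) top3;
    }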
@@ -1581,7 +1648,6 @@ extern "C" {
 int kh);

 // used in sam
-
 GGML_API struct ggml_tensor * ggml_add_rel_pos(
 struct ggml_context * ctx,
 struct ggml_tensor * a,

@@ -1756,7 +1822,7 @@ extern "C" {
 GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
 GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
 GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
+GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
 GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
 GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
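Note the signature change: ggml_graph_view() now returns a ggml_cgraph by value and no longer takes a context. A one-call sketch of the updated pattern, with hypothetical names:

    #include "ggml.h"

    // before: struct ggml_cgraph * gv = ggml_graph_view(ctx, gf, i0, i1);
    static struct ggml_cgraph graph_slice(struct ggml_cgraph * gf, int i0, int i1) {
        return ggml_graph_view(gf, i0, i1); // view over a sub-range of the nodes of gf
    }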
@@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `twine
 pip install build twine
 ```

-Then, folow these steps to release a new version:
+Then, follow these steps to release a new version:

 1. Bump the version in `pyproject.toml`.
 2. Build the package:
@@ -38,6 +38,8 @@ class Keys:
 FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
 USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
 TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
+EXPERT_COUNT = "{arch}.expert_count"
+EXPERT_USED_COUNT = "{arch}.expert_used_count"

 class Attention:
 HEAD_COUNT = "{arch}.attention.head_count"

@@ -111,10 +113,14 @@ class MODEL_TENSOR(IntEnum):
 ATTN_NORM = auto()
 ATTN_NORM_2 = auto()
 ATTN_ROT_EMBD = auto()
+FFN_GATE_INP = auto()
+FFN_NORM = auto()
 FFN_GATE = auto()
 FFN_DOWN = auto()
 FFN_UP = auto()
-FFN_NORM = auto()
+FFN_GATE_EXP = auto()
+FFN_DOWN_EXP = auto()
+FFN_UP_EXP = auto()
 ATTN_Q_NORM = auto()
 ATTN_K_NORM = auto()

@@ -154,10 +160,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
 MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
 MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
 MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
+MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
 MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
 MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
 MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
 MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
+MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
+MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
+MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -172,10 +182,14 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
 MODEL_TENSOR.ATTN_V,
 MODEL_TENSOR.ATTN_OUT,
 MODEL_TENSOR.ATTN_ROT_EMBD,
+MODEL_TENSOR.FFN_GATE_INP,
 MODEL_TENSOR.FFN_NORM,
 MODEL_TENSOR.FFN_GATE,
 MODEL_TENSOR.FFN_DOWN,
 MODEL_TENSOR.FFN_UP,
+MODEL_TENSOR.FFN_GATE_EXP,
+MODEL_TENSOR.FFN_DOWN_EXP,
+MODEL_TENSOR.FFN_UP_EXP,
 ],
 MODEL_ARCH.GPTNEOX: [
 MODEL_TENSOR.TOKEN_EMBD,
@@ -339,6 +339,12 @@ class GGUFWriter:
 def add_clamp_kqv(self, value: float) -> None:
     self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

+def add_expert_count(self, count: int) -> None:
+    self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
+
+def add_expert_used_count(self, count: int) -> None:
+    self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
+
 def add_layer_norm_eps(self, value: float) -> None:
     self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

@@ -149,6 +149,11 @@ class TensorNameMap:
 "model.layers.{bid}.ln2", # yi
 ),

+MODEL_TENSOR.FFN_GATE_INP: (
+"layers.{bid}.feed_forward.gate", # mixtral
+"model.layers.{bid}.block_sparse_moe.gate", # mixtral
+),
+
 # Feed-forward up
 MODEL_TENSOR.FFN_UP: (
 "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox

@@ -164,11 +169,21 @@ class TensorNameMap:
 "transformer.h.{bid}.mlp.w1", # qwen
 ),

+MODEL_TENSOR.FFN_UP_EXP: (
+"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
+"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
+),
+
 # Feed-forward gate
 MODEL_TENSOR.FFN_GATE: (
 "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
 "layers.{bid}.feed_forward.w1", # llama-pth
 "transformer.h.{bid}.mlp.w2", # qwen
+),
+
+MODEL_TENSOR.FFN_GATE_EXP: (
+"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
+"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
 ),

 # Feed-forward down

@@ -185,6 +200,11 @@ class TensorNameMap:
 "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
 ),

+MODEL_TENSOR.FFN_DOWN_EXP: (
+"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
+"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
+),
+
 MODEL_TENSOR.ATTN_Q_NORM: (
 "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
 ),

@@ -213,11 +233,14 @@ class TensorNameMap:
 for tensor, keys in self.block_mappings_cfg.items():
     if tensor not in MODEL_TENSORS[arch]:
         continue
-    tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
-    self.mapping[tensor_name] = (tensor, tensor_name)
-    for key in keys:
-        key = key.format(bid = bid)
-        self.mapping[key] = (tensor, tensor_name)
+    # TODO: make this configurable
+    n_experts = 8
+    for xid in range(n_experts):
+        tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
+        self.mapping[tensor_name] = (tensor, tensor_name)
+        for key in keys:
+            key = key.format(bid = bid, xid = xid)
+            self.mapping[key] = (tensor, tensor_name)

 def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
     result = self.mapping.get(key)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.6.0"
+version = "0.7.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [

14 llama.h
@@ -39,10 +39,11 @@

 #define LLAMA_MAX_RNG_STATE (64*1024)

+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.

@@ -211,11 +212,14 @@ extern "C" {
 float yarn_beta_slow; // YaRN high correction dim
 uint32_t yarn_orig_ctx; // YaRN original context size

+enum ggml_type type_k; // data type for K cache
+enum ggml_type type_v; // data type for V cache
+
 // Keep the booleans together to avoid misalignment during copy-by-value.
 bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-bool f16_kv; // use fp16 for KV cache, fp32 otherwise
-bool logits_all; // the llama_eval() call computes all logits, not just the last one
-bool embedding; // embedding mode only
+bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+bool embedding; // embedding mode only
+bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 };

 // model quantization parameters
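llama_context_params gains explicit KV-cache types and a KQV-offload switch (and loses f16_kv). A hedged sketch of filling in the new fields; the chosen values are illustrative, not the library defaults:

    #include "llama.h"

    static struct llama_context_params my_ctx_params(void) {
        struct llama_context_params p = llama_context_default_params();
        p.type_k      = GGML_TYPE_F16; // K cache data type (replaces the removed f16_kv flag)
        p.type_v      = GGML_TYPE_F16; // V cache data type
        p.offload_kqv = true;          // keep the KQV ops and the KV cache on the GPU
        return p;
    }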
@@ -1,3 +1,5 @@
 numpy==1.24.4
 sentencepiece==0.1.98
+transformers>=4.34.0
 gguf>=0.1.0
+protobuf>=4.21.0
38 scripts/get-flags.mk (new file)
@@ -0,0 +1,38 @@
+ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
+GF_CC_IS_GCC = 1
+GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+else
+GF_CC_IS_CLANG = 1
+ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
+GF_CC_IS_LLVM_CLANG = 1
+else
+GF_CC_IS_APPLE_CLANG = 1
+endif
+GF_CC_VER := \
+$(shell $(GF_CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
+| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+endif
+
+ifeq ($(GF_CC_IS_CLANG), 1)
+# clang options
+GF_CFLAGS = -Wunreachable-code-break -Wunreachable-code-return
+GF_CXXFLAGS = -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
+
+ifneq '' '$(and $(GF_CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 030800)))'
+GF_CFLAGS += -Wdouble-promotion
+endif
+ifneq '' '$(and $(GF_CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 070300)))'
+GF_CFLAGS += -Wdouble-promotion
+endif
+else
+# gcc options
+GF_CFLAGS = -Wdouble-promotion
+GF_CXXFLAGS = -Wno-array-bounds
+
+ifeq ($(shell expr $(GF_CC_VER) \>= 070100), 1)
+GF_CXXFLAGS += -Wno-format-truncation
+endif
+ifeq ($(shell expr $(GF_CC_VER) \>= 080100), 1)
+GF_CXXFLAGS += -Wextra-semi
+endif
+endif
@@ -20,5 +20,6 @@ cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
 cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
 cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h

 cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
+cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
@@ -22,26 +22,32 @@ endfunction()
 llama_build_and_test_executable(test-quantize-fns.cpp)
 llama_build_and_test_executable(test-quantize-perf.cpp)
 llama_build_and_test_executable(test-sampling.cpp)

 llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)

 llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)

 llama_build_executable(test-tokenizer-1-llama.cpp)
 llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)

 llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+llama_test_executable (test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable (test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
+llama_test_executable (test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+# llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG

 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
-llama_build_and_test_executable(test-grad0.cpp) # SLOW
+llama_build_and_test_executable(test-grad0.cpp)
 # llama_build_and_test_executable(test-opt.cpp) # SLOW
+llama_build_and_test_executable(test-backend-ops.cpp)

 llama_build_and_test_executable(test-rope.cpp)

1688 tests/test-backend-ops.cpp (new file)
File diff suppressed because it is too large
@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #include "ggml.h"

 #include <cmath>

@@ -117,7 +117,7 @@ static void usage(char * argv[]) {
 printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
 printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
 printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
-printf(" --op OP set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
+printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
 printf(" quantize_row_q_dot, vec_dot_q (all)\n");
 printf(" --type TYPE set test type as");
 for (int i = 0; i < GGML_TYPE_COUNT; i++) {

@@ -202,7 +202,7 @@ int main(int argc, char * argv[]) {
 }
 int alignment = std::stoi(argv[i]);
 if (alignment < 0 || alignment > MAX_ALIGNMENT) {
-fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT);
+fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
 invalid_param = true;
 break;
 }

@@ -286,7 +286,7 @@ int main(int argc, char * argv[]) {
 qfns.from_float_reference(test_data1, test_q1, size);
 return test_q1[0];
 };
-size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+size_t quantized_size = ggml_row_size(type, size);
 benchmark_function(size, quantized_size, iterations, quantize_fn);
 }
 printf("\n");

@@ -300,7 +300,7 @@ int main(int argc, char * argv[]) {
 qfns.from_float(test_data1, test_q1, size);
 return test_q1[0];
 };
-size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+size_t quantized_size = ggml_row_size(type, size);
 benchmark_function(size, quantized_size, iterations, quantize_fn);
 }
 printf("\n");

@@ -315,7 +315,7 @@ int main(int argc, char * argv[]) {
 qfns.to_float(test_q1, test_out, size);
 return test_out[0];
 };
-size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+size_t quantized_size = ggml_row_size(type, size);
 benchmark_function(size, quantized_size, iterations, quantize_fn);
 }
 printf("\n");

@@ -330,7 +330,7 @@ int main(int argc, char * argv[]) {
 vdot.from_float(test_data1, test_q1, size);
 return test_q1[0];
 };
-size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+size_t quantized_size = ggml_row_size(type, size);
 benchmark_function(size, quantized_size, iterations, quantize_fn);
 }
 printf("\n");

@@ -347,7 +347,7 @@ int main(int argc, char * argv[]) {
 qfns.vec_dot(size, &result, test_q1, test_q2);
 return result;
 };
-size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+size_t quantized_size = ggml_row_size(type, size);
 benchmark_function(size, quantized_size, iterations, quantize_fn);
 }
 printf("\n");