Merge branch 'master' into compilade/bitnet-ternary

This commit is contained in:
Francis Couture-Harpin 2024-07-28 21:27:33 -04:00
commit 79a278e922
340 changed files with 43840 additions and 162220 deletions

View File

@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

View File

@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

View File

@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
@ -25,7 +25,7 @@ ENV GGML_CUDA=1
RUN make -j$(nproc) llama-cli
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1

View File

@ -1,6 +1,6 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@ -14,10 +14,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
echo "Building with static libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli

View File

@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git
@ -11,7 +11,7 @@ COPY . .
RUN make -j$(nproc) llama-cli
FROM ubuntu:$UBUNTU_VERSION as runtime
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libgomp1

View File

@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
@ -27,7 +27,7 @@ ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl

View File

@ -1,6 +1,6 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@ -14,10 +14,11 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl

View File

@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev curl
@ -13,7 +13,7 @@ ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION as runtime
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1

View File

@ -10,7 +10,6 @@
"llama-embedding"
"llama-server"
"llama-quantize"
"llama-train-text-from-scratch"
];
mkApp = name: {
type = "app";

View File

@ -17,19 +17,19 @@
rocmPackages,
vulkan-headers,
vulkan-loader,
clblast,
curl,
shaderc,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
useOpenCL
useRocm
useVulkan
] && blas.meta.available,
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
useMpi ? false, # Increases the runtime closure size by ~700M
useOpenCL ? false,
useRocm ? config.rocmSupport,
enableCurl ? true,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@ -56,7 +56,6 @@ let
++ lib.optionals useCuda [ "CUDA" ]
++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ]
++ lib.optionals useVulkan [ "Vulkan" ];
@ -91,6 +90,22 @@ let
ps.tiktoken
ps.torchWithoutCuda
ps.transformers
# server bench
ps.matplotlib
# server tests
ps.openai
ps.behave
ps.prometheus-client
# for examples/pydantic-models-to-grammar-examples.py
ps.docstring-parser
ps.pydantic
# for scripts/compare-llama-bench.py
ps.gitpython
ps.tabulate
]
);
@ -132,6 +147,7 @@ let
vulkanBuildInputs = [
vulkan-headers
vulkan-loader
shaderc
];
in
@ -198,19 +214,19 @@ effectiveStdenv.mkDerivation (
optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];
cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CLBLAST" useOpenCL)
(cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIPBLAS" useRocm)
(cmakeBool "GGML_METAL" useMetalKit)
@ -254,7 +270,6 @@ effectiveStdenv.mkDerivation (
useCuda
useMetalKit
useMpi
useOpenCL
useRocm
useVulkan
;
@ -281,7 +296,7 @@ effectiveStdenv.mkDerivation (
# Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
badPlatforms = optionals useCuda lib.platforms.darwin;
# Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`.

View File

@ -8,13 +8,11 @@ arg1="$1"
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert-hf-to-gguf.py "$@"
python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@"
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
./llama-finetune "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -36,8 +34,6 @@ else
echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
echo " See documentation for finetune for command-line parameters"
echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B"
echo " --server (-s): Run a model on the server"

View File

@ -9,5 +9,3 @@ contact_links:
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with

4
.github/labeler.yml vendored
View File

@ -16,7 +16,9 @@ SYCL:
- any-glob-to-any-file:
- ggml/include/ggml-sycl.h
- ggml/src/ggml-sycl.cpp
- README-sycl.md
- ggml/src/ggml-sycl/**
- docs/backend/SYCL.md
- examples/sycl/**
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:

View File

@ -47,7 +47,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@ -105,7 +105,7 @@ jobs:
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@ -222,7 +222,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build . --config Release -j $(nproc)
- name: Test
@ -355,8 +355,10 @@ jobs:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libvulkan-dev
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential vulkan-sdk
- name: Build
id: cmake_build
@ -799,6 +801,7 @@ jobs:
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
- name: Determine tag name
@ -857,6 +860,7 @@ jobs:
mkdir build
cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name

38
.github/workflows/python-type-check.yml vendored Normal file
View File

@ -0,0 +1,38 @@
name: Python Type-Check
on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- '**.py'
- '**/requirements*.txt'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- '**.py'
- '**/requirements*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
python-type-check:
runs-on: ubuntu-latest
name: pyright type-check
steps:
- name: Check out source repository
uses: actions/checkout@v4
- name: Set up Python environment
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Python dependencies
# TODO: use a venv
run: pip install -r requirements/requirements-all.txt
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.370
level: warning
warnings: true

18
.gitignore vendored
View File

@ -47,8 +47,10 @@ build*
!build-info.cpp.in
!build-info.sh
!build.zig
!docs/build.md
/libllama.so
/llama-*
/vulkan-shaders-gen
android-ndk-*
arm_neon.h
cmake-build-*
@ -60,6 +62,11 @@ llama-batched-swift
out/
tmp/
# Deprecated
/main
/server
# CI
!.github/workflows/*.yml
@ -98,13 +105,14 @@ examples/server/*.mjs.hpp
# Python
__pycache__
.venv
/Pipfile
dist
poetry.lock
/.venv
__pycache__/
*/poetry.lock
poetry.toml
# Nix
/result
# Test binaries
/tests/test-backend-ops
/tests/test-double-float

View File

@ -42,13 +42,14 @@ endif()
option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
endif()
#
# option list
#
# general
option(LLAMA_CCACHE "llama: use ccache if available" ON)
# debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@ -73,20 +74,26 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
# override ggml options
set(GGML_CCACHE ${LLAMA_CCACHE})
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
set(GGML_LLAMAFILE ON)
set(GGML_CUDA_USE_GRAPHS ON)
# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE ON)
endif()
if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
set(GGML_CUDA_USE_GRAPHS ON)
endif()
# transition helpers
function (llama_option_depr TYPE OLD NEW)
if (${OLD})
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON)
set(${NEW} ON PARENT_SCOPE)
endif()
endfunction()
@ -96,16 +103,19 @@ llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP)
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
#
# build the library
#
add_subdirectory(ggml)
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
add_subdirectory(src)
#
@ -123,7 +133,16 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
# At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
install(TARGETS llama LIBRARY PUBLIC_HEADER)
@ -146,7 +165,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
install(
FILES convert-hf-to-gguf.py
FILES convert_hf_to_gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE

View File

@ -19,6 +19,7 @@
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "cl",
"GGML_SYCL": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}

View File

@ -1,14 +1,28 @@
# Contributing Guidelines
# Pull requests (for contributors)
## Checklist
- Test your changes:
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
* Execute [the full CI locally on your machine](ci/README.md) before publishing
# Pull requests (for collaborators)
## PR formatting
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
# Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)
* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`

272
Makefile
View File

@ -11,9 +11,9 @@ BUILD_TARGETS = \
llama-embedding \
llama-eval-callback \
llama-export-lora \
llama-finetune \
llama-gbnf-validator \
llama-gguf \
llama-gguf-hash \
llama-gguf-split \
llama-gritlm \
llama-imatrix \
@ -36,7 +36,6 @@ BUILD_TARGETS = \
llama-simple \
llama-speculative \
llama-tokenize \
llama-train-text-from-scratch \
llama-vdot \
llama-cvector-generator \
tests/test-c.o
@ -45,6 +44,7 @@ BUILD_TARGETS = \
TEST_TARGETS = \
tests/test-autorelease \
tests/test-backend-ops \
tests/test-chat-template \
tests/test-double-float \
tests/test-grad0 \
tests/test-grammar-integration \
@ -61,6 +61,15 @@ TEST_TARGETS = \
tests/test-tokenizer-1-bpe \
tests/test-tokenizer-1-spm
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
# Deprecation aliases
ifdef LLAMA_CUBLAS
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@ -186,7 +195,11 @@ ifdef GGML_RPC
BUILD_TARGETS += rpc-server
endif
default: $(BUILD_TARGETS)
ifdef GGML_VULKAN
BUILD_TARGETS += vulkan-shaders-gen
endif
default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
test: $(TEST_TARGETS)
@failures=0; \
@ -221,7 +234,7 @@ test: $(TEST_TARGETS)
fi
@echo 'All tests passed.'
all: $(BUILD_TARGETS) $(TEST_TARGETS)
all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
ifdef RISCV_CROSS_COMPILE
CC := riscv64-unknown-linux-gnu-gcc
@ -238,17 +251,22 @@ MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11
ifndef LLAMA_NO_CCACHE
ifdef LLAMA_NO_CCACHE
GGML_NO_CCACHE := 1
DEPRECATE_WARNING := 1
endif
ifndef GGML_NO_CCACHE
CCACHE := $(shell which ccache)
ifdef CCACHE
export CCACHE_SLOPPINESS = time_macros
$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
CC := $(CCACHE) $(CC)
CXX := $(CCACHE) $(CXX)
else
$(info I ccache not found. Consider installing it for faster compilation.)
endif # CCACHE
endif # LLAMA_NO_CCACHE
endif # GGML_NO_CCACHE
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@ -307,9 +325,9 @@ ifdef LLAMA_DEBUG
endif
else
MK_CPPFLAGS += -DNDEBUG
MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
MK_NVCCFLAGS += -O3
MK_CFLAGS += -O3 -g
MK_CXXFLAGS += -O3 -g
MK_NVCCFLAGS += -O3 -g
endif
ifdef LLAMA_SANITIZE_THREAD
@ -510,10 +528,21 @@ ifndef GGML_NO_ACCELERATE
endif
endif # GGML_NO_ACCELERATE
ifdef GGML_MUSA
CC := clang
CXX := clang++
GGML_CUDA := 1
MK_CPPFLAGS += -DGGML_USE_MUSA
endif
ifndef GGML_NO_OPENMP
MK_CPPFLAGS += -DGGML_USE_OPENMP
MK_CFLAGS += -fopenmp
MK_CXXFLAGS += -fopenmp
ifdef GGML_MUSA
MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
MK_LDFLAGS += -L/usr/lib/llvm-10/lib
endif # GGML_MUSA
endif # GGML_NO_OPENMP
ifdef GGML_OPENBLAS
@ -531,14 +560,20 @@ ifdef GGML_OPENBLAS64
endif # GGML_OPENBLAS64
ifdef GGML_BLIS
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_BLIS
ifdef GGML_NVPL
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_NVPL
ifndef GGML_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJ_GGML += ggml/src/sgemm.o
OBJ_GGML += ggml/src/llamafile/sgemm.o
endif
ifdef GGML_RPC
@ -558,15 +593,27 @@ else
endif # GGML_CUDA_FA_ALL_QUANTS
ifdef GGML_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
ifdef GGML_MUSA
ifneq ('', '$(wildcard /opt/musa)')
CUDA_PATH ?= /opt/musa
else
CUDA_PATH ?= /usr/local/musa
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
else
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math
endif # GGML_MUSA
OBJ_GGML += ggml/src/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@ -576,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings
endif # LLAMA_FATAL_WARNINGS
ifndef GGML_MUSA
ifndef JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT
endif # GGML_MUSA
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
@ -591,8 +640,12 @@ endif # GGML_CUDA_DEBUG
ifdef GGML_CUDA_NVCC
NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
else
NVCC = $(CCACHE) nvcc
endif #GGML_CUDA_NVCC
ifdef GGML_MUSA
NVCC = $(CCACHE) mcc
else
NVCC = $(CCACHE) nvcc
endif # GGML_MUSA
endif # GGML_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH
MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@ -663,9 +716,15 @@ define NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
else
ifdef GGML_MUSA
define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
endef # NVCC_COMPILE
else
define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
endif # GGML_MUSA
endif # JETSON_EOL_MODULE_DETECT
ggml/src/ggml-cuda/%.o: \
@ -688,8 +747,8 @@ endif # GGML_CUDA
ifdef GGML_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
OBJ_GGML += ggml/src/ggml-vulkan.o
MK_LDFLAGS += $(shell pkg-config --libs vulkan)
OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
ifdef GGML_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
@ -711,10 +770,28 @@ ifdef GGML_VULKAN_RUN_TESTS
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
endif
ggml/src/ggml-vulkan.o: \
ggml/src/ggml-vulkan.cpp \
ggml/include/ggml-vulkan.h
$(CXX) $(CXXFLAGS) -c $< -o $@
GLSLC_CMD = glslc
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
_ggml_vk_input_dir = ggml/src/vulkan-shaders
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
$(_ggml_vk_header): $(_ggml_vk_source)
$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
$(_ggml_vk_genshaders_cmd) \
--glslc $(GLSLC_CMD) \
--input-dir $(_ggml_vk_input_dir) \
--target-hpp $(_ggml_vk_header) \
--target-cpp $(_ggml_vk_source)
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
endif # GGML_VULKAN
ifdef GGML_HIPBLAS
@ -751,6 +828,14 @@ ifdef GGML_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # GGML_CUDA_FORCE_MMQ
ifdef GGML_CUDA_FORCE_CUBLAS
HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # GGML_CUDA_FORCE_CUBLAS
ifdef GGML_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY
@ -819,10 +904,14 @@ OBJ_GGML += \
ggml/src/ggml.o \
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \
ggml/src/ggml-quants.o
ggml/src/ggml-quants.o \
ggml/src/ggml-aarch64.o
OBJ_LLAMA = \
src/llama.o \
src/llama-vocab.o \
src/llama-grammar.o \
src/llama-sampling.o \
src/unicode.o \
src/unicode-data.o
@ -890,6 +979,7 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef GGML_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifndef GGML_MUSA
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
@ -899,6 +989,7 @@ endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # GGML_MUSA
endif # GGML_CUDA
$(info )
@ -919,6 +1010,7 @@ $(info - LLAMA_NO_LLAMAFILE)
$(info - LLAMA_NO_ACCELERATE)
$(info - LLAMA_NO_OPENMP)
$(info - LLAMA_NO_METAL)
$(info - LLAMA_NO_CCACHE)
$(info )
endif
@ -952,15 +1044,22 @@ ggml/src/ggml-quants.o: \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-aarch64.o: \
ggml/src/ggml-aarch64.c \
ggml/include/ggml.h \
ggml/src/ggml-aarch64.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-blas.o: \
ggml/src/ggml-blas.cpp \
ggml/include/ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ifndef GGML_NO_LLAMAFILE
ggml/src/sgemm.o: \
ggml/src/sgemm.cpp \
ggml/src/sgemm.h \
ggml/src/llamafile/sgemm.o: \
ggml/src/llamafile/sgemm.cpp \
ggml/src/llamafile/sgemm.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE
@ -994,6 +1093,10 @@ src/unicode-data.o: \
src/llama.o: \
src/llama.cpp \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-grammar.h \
src/llama-sampling.h \
src/unicode.h \
include/llama.h \
ggml/include/ggml-cuda.h \
@ -1003,6 +1106,29 @@ src/llama.o: \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-vocab.o: \
src/llama-vocab.cpp \
src/llama-vocab.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-grammar.o: \
src/llama-grammar.cpp \
src/llama-grammar.h \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-sampling.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-sampling.o: \
src/llama-sampling.cpp \
src/llama-sampling.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_LLAMA): \
$(OBJ_LLAMA) \
$(LIB_GGML)
@ -1070,6 +1196,7 @@ clean:
rm -rvf src/*.o
rm -rvf tests/*.o
rm -rvf examples/*.o
rm -rvf common/*.o
rm -rvf *.a
rm -rvf *.dll
rm -rvf *.so
@ -1084,6 +1211,8 @@ clean:
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
rm -rvf $(LEGACY_TARGETS_CLEAN)
find examples pocs -type f -name "*.o" -delete
#
@ -1170,6 +1299,23 @@ llama-gguf: examples/gguf/gguf.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
examples/gguf-hash/deps/sha1/sha1.o: \
examples/gguf-hash/deps/sha1/sha1.c
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
examples/gguf-hash/deps/xxhash/xxhash.o: \
examples/gguf-hash/deps/xxhash/xxhash.c
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
examples/gguf-hash/deps/sha256/sha256.o: \
examples/gguf-hash/deps/sha256/sha256.c
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@ -1185,11 +1331,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
$(OBJ_GGML) $(OBJ_LLAMA)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@ -1205,13 +1346,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-finetune: examples/finetune/finetune.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-export-lora: examples/export-lora/export-lora.cpp \
$(OBJ_GGML) common/log.h
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1358,7 +1494,7 @@ run-benchmark-matmult: llama-benchmark-matmult
.PHONY: run-benchmark-matmult swift
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1462,3 +1598,51 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
#
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
#
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these target from building all the time.
main: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
server: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
quantize: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning."
@echo "#########"
endif
perplexity: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########"
endif
embedding: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning."
@echo "#########"
endif

View File

@ -4,12 +4,16 @@ import PackageDescription
var sources = [
"src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-aarch64.c",
]
var resources: [Resource] = []

779
README.md
View File

@ -3,7 +3,7 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -13,7 +13,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
### Recent API changes
## Recent API changes
- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
@ -24,9 +24,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
### Hot topics
## Hot topics
- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
@ -39,37 +39,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
----
<details>
<summary>Table of Contents</summary>
<ol>
<li>
<a href="#description">Description</a>
</li>
<li>
<a href="#usage">Usage</a>
<ul>
<li><a href="#get-the-code">Get the Code</a></li>
<li><a href="#build">Build</a></li>
<li><a href="#blas-build">BLAS Build</a></li>
<li><a href="#prepare-and-quantize">Prepare and Quantize</a></li>
<li><a href="#run-the-quantized-model">Run the quantized model</a></li>
<li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
<li><a href="#android">Android</a></li>
<li><a href="#docker">Docker</a></li>
</ul>
</li>
<li><a href="#contributing">Contributing</a></li>
<li><a href="#coding-guidelines">Coding guidelines</a></li>
<li><a href="#docs">Docs</a></li>
</ol>
</details>
## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@ -87,14 +56,6 @@ Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomm
improved significantly thanks to many contributions. It is the main playground for developing new features for the
[ggml](https://github.com/ggerganov/ggml) library.
**Supported platforms:**
- [X] Mac OS
- [X] Linux
- [X] Windows (via CMake)
- [X] Docker
- [X] FreeBSD
**Supported models:**
Typically finetunes of the base models below are supported as well.
@ -108,6 +69,7 @@ Typically finetunes of the base models below are supported as well.
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@ -134,8 +96,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
**Multimodal models:**
@ -149,12 +112,6 @@ Typically finetunes of the base models below are supported as well.
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
**HTTP server**
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@ -175,11 +132,13 @@ Typically finetunes of the base models below are supported as well.
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
**UI:**
Unless otherwise noted these projects are open-source with permissive licensing:
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground)
@ -217,10 +176,19 @@ Unless otherwise noted these projects are open-source with permissive licensing:
**Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
---
**Infrastructure:**
Here is a typical run using LLaMA v2 13B on M2 Ultra:
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
## Demo
<details>
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
```
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
@ -300,454 +268,85 @@ llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms
llama_print_timings: total time = 25431.49 ms
```
</details>
<details>
<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
</details>
## Usage
Here are the end-to-end binary build and model conversion steps for most supported models.
### Get the Code
### Basic usage
Firstly, you need to get the binary. There are different methods that you can follow:
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
You can run a basic completion using this command:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
# Output:
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
### Build
See [this page](./examples/main/README.md) for a full list of parameters.
In order to build llama.cpp you have four different options.
### Conversation mode
- Using `make`:
- On Linux or MacOS:
```bash
make
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc.
3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder.
5. From here you can run:
```bash
make
```
- Notes:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
**Notes**:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
2. Add your user to **video** group
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
```
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
### Nix
On Mac and Linux, the Nix package manager can be used via
```
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.
Or
```
nix-env --file '<nixpkgs>' --install --attr llama-cpp
```
For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
#### Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.
### Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework:
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
- #### OpenBLAS:
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
- Using `make`:
- On Linux:
```bash
make GGML_OPENBLAS=1
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
3. Extract `w64devkit` on your pc.
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder.
8. From here you can run:
```bash
make GGML_OPENBLAS=1
```
- Using `CMake` on Linux:
```bash
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release
```
- #### BLIS
Check [BLIS.md](docs/BLIS.md) for more information.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
- #### Intel oneMKL
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
- Using manual oneAPI installation:
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
cmake --build build --config Release
```
- Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
- #### CUDA
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
- Using `make`:
```bash
make GGML_CUDA=1
```
- Using `CMake`:
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
- #### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
make GGML_HIPBLAS=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
Note that if you get the following error:
```
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
```
Try searching for a directory under `HIP_PATH` that contains the file
`oclc_abi_version_400.bc`. Then, add the following to the start of the
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
like:
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### Vulkan
**With docker**:
You don't need to install Vulkan SDK. It will be installed inside the container.
```sh
# Build the image
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
**Without docker**:
Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
For example, on Ubuntu 22.04 (jammy), use the command below:
```bash
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt update -y
apt-get install -y vulkan-sdk
# To verify the installation, use the command below:
vulkaninfo
```
Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below:
```bash
cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### Prepare and Quantize
> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
```bash
# obtain the official LLaMA model weights and place them in ./models
ls ./models
llama-2-7b tokenizer_checklist.chk tokenizer.model
# [Optional] for models using BPE tokenizers
ls ./models
<folder containing weights and tokenizer json> vocab.json
# [Optional] for PyTorch .bin models like Mistral-7B
ls ./models
<folder containing weights and tokenizer json>
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
# install Python dependencies
python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert-hf-to-gguf.py models/mymodel/
# quantize the model to 4-bits (using Q4_K_M method)
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
# update the gguf filetype to current version if older version is now unsupported
./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
# Output:
# > hi, who are you?
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
#
# > what is 1+1?
# Easy peasy! The answer to 1+1 is... 2!
```
### Run the quantized model
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
```bash
# start inference on a gguf model
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
### Running on Windows with prebuilt binaries
You will find prebuilt Windows binaries on the release page.
Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so:
```
.\main -m llama-2-7b.Q4_0.gguf -n 128
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
```
### Memory/Disk Requirements
### Web server
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
| Model | Original size | Quantized size (Q4_0) |
|------:|--------------:|----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
Example usage:
### Quantization
```bash
./llama-server -m your_model.gguf --port 8080
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
- recent k-quants improvements and new i-quants
- [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
- [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
- [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
- [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
- [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
- [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
- [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
### Perplexity (measuring model quality)
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
#### How to run
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
# Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
```
perplexity : calculating perplexity over 655 chunks
24.43 seconds per pass - ETA 4.45 hours
[1]4.5970,[2]5.1807,[3]6.0382,...
```
And after 4.45 hours, you will have the final perplexity.
### Interactive mode
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
> [!NOTE]
> If you prefer basic usage, please consider using conversation mode instead of interactive mode
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command
@ -798,18 +397,71 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Obtaining and using the Facebook LLaMA 2 model
## Build
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
- [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGUF)
- [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGUF)
- [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGUF)
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
Please refer to [Build llama.cpp locally](./docs/build.md)
### Seminal papers and background on the models
## Supported backends
| Backend | Target devices |
| --- | --- |
| [Metal](./docs/build.md#metal-build) | Apple Silicon |
| [BLAS](./docs/build.md#blas-build) | All |
| [BLIS](./docs/backend/BLIS.md) | All |
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU |
## Tools
### Prepare and Quantize
> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
### Perplexity (measuring model quality)
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
## Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
## Other documentations
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [GBNF grammars](./grammars/README.md)
**Development documentations**
- [How to build](./docs/build.md)
- [Running on Docker](./docs/docker.md)
- [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
**Seminal papers and background on the models**
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA:
@ -820,178 +472,3 @@ If your issue is with model generation quality, then please at least scan the fo
- GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
### Android
#### Build on Android using Termux
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```
$ mkdir build-android
$ cd build-android
$ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here's a demo of an interactive session running on Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
### Docker
#### Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (ex. /llama/models)
#### Images
We have three Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, there the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
#### Usage
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On completion, you are ready to play!
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a light image:
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a server image:
```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
```
### Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
#### Building Locally
```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
The defaults are:
- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
The resulting images, are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
#### Usage
After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```
### Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
### Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)
### Docs
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [GBNF grammars](./grammars/README.md)

View File

@ -103,6 +103,9 @@ function gg_run_ctest_debug {
set -e
# Check cmake, make and ctest are installed
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
@ -131,6 +134,9 @@ function gg_run_ctest_release {
set -e
# Check cmake, make and ctest are installed
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
@ -287,7 +293,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -421,7 +427,7 @@ function gg_run_pythia_1_4b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -553,7 +559,7 @@ function gg_run_pythia_2_8b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -688,7 +694,7 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -701,6 +707,20 @@ function gg_run_embd_bge_small {
set +e
}
function gg_check_build_requirements {
if ! command -v cmake &> /dev/null; then
gg_printf 'cmake not found, please install'
fi
if ! command -v make &> /dev/null; then
gg_printf 'make not found, please install'
fi
if ! command -v ctest &> /dev/null; then
gg_printf 'ctest not found, please install'
fi
}
function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}"

View File

@ -8,6 +8,13 @@ set(GGML_CUDA @GGML_CUDA@)
set(GGML_METAL @GGML_METAL@)
set(GGML_HIPBLAS @GGML_HIPBLAS@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)
set(GGML_VULKAN @GGML_VULKAN@)
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
set(GGML_SYCL @GGML_SYCL@)
set(GGML_OPENMP @GGML_OPENMP@)
@PACKAGE_INIT@
@ -37,18 +44,36 @@ if (GGML_METAL)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
endif()
if (GGML_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
if (GGML_SYCL)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
endif()
find_library(ggml_LIBRARY ggml
REQUIRED
HINTS ${LLAMA_LIB_DIR})
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR})
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
add_library(llama UNKNOWN IMPORTED)

View File

@ -1,3 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "common.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
@ -190,6 +194,12 @@ int32_t cpu_get_num_math() {
// CLI argument parsing
//
void gpt_params_handle_hf_token(gpt_params & params) {
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
params.hf_token = std::getenv("HF_TOKEN");
}
}
void gpt_params_handle_model_default(gpt_params & params) {
if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
@ -237,6 +247,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
gpt_params_handle_model_default(params);
gpt_params_handle_hf_token(params);
if (params.escape) {
string_process_escapes(params.prompt);
string_process_escapes(params.input_prefix);
@ -472,6 +484,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
if (arg == "--attention") {
CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
else { invalid_param = true; }
return true;
}
if (arg == "--defrag-thold" || arg == "-dt") {
CHECK_ARG
params.defrag_thold = std::stof(argv[i]);
@ -644,6 +664,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.model_url = argv[i];
return true;
}
if (arg == "-hft" || arg == "--hf-token") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.hf_token = argv[i];
return true;
}
if (arg == "-hfr" || arg == "--hf-repo") {
CHECK_ARG
params.hf_repo = argv[i];
@ -657,7 +685,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
if (arg == "--lora") {
CHECK_ARG
params.lora_adapter.emplace_back(argv[i], 1.0f);
params.use_mmap = false;
return true;
}
if (arg == "--lora-scaled") {
@ -665,12 +692,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
const char* lora_adapter = argv[i];
CHECK_ARG
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.use_mmap = false;
return true;
}
if (arg == "--lora-base") {
CHECK_ARG
params.lora_base = argv[i];
return true;
}
if (arg == "--control-vector") {
@ -757,7 +778,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cache_type_v = argv[++i];
return true;
}
if (arg == "--multiline-input") {
if (arg == "-mli" || arg == "--multiline-input") {
params.multiline_input = true;
return true;
}
@ -769,6 +790,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cont_batching = true;
return true;
}
if (arg == "-nocb" || arg == "--no-cont-batching") {
params.cont_batching = false;
return true;
}
if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true;
return true;
@ -1014,16 +1039,23 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
params.enable_chat_template = false;
return true;
}
if (arg == "--in-prefix") {
CHECK_ARG
params.input_prefix = argv[i];
params.enable_chat_template = false;
return true;
}
if (arg == "--in-suffix") {
CHECK_ARG
params.input_suffix = argv[i];
params.enable_chat_template = false;
return true;
}
if (arg == "--spm-infill") {
params.spm_infill = true;
return true;
}
if (arg == "--grammar") {
@ -1237,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
CHECK_ARG
params.out_file = argv[i];
params.cvector_outfile = argv[i];
params.lora_outfile = argv[i];
return true;
}
if (arg == "-ofreq" || arg == "--output-frequency") {
@ -1291,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
if (arg == "--no-warmup") {
params.warmup = false;
return true;
}
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@ -1387,7 +1424,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
"in conversation mode, this will be used as system prompt\n"
"(default: '%s')", params.prompt.c_str() });
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
@ -1402,13 +1441,18 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"halt generation at PROMPT, return control in interactive mode\n"
"can be specified more than once for multiple prompts" });
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
"if suffix/prefix are not specified, default chat template will be used\n"
"(default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
options.push_back({ "server infill",
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
options.push_back({ "sampling" });
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
@ -1444,6 +1488,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
"set custom jinja chat template (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted:\n"
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "grammar" });
@ -1454,8 +1499,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
options.push_back({ "embedding" });
options.push_back({ "embedding", " --pooling {none,mean,cls}",
options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
"pooling type for embeddings, use model default if unspecified" });
options.push_back({ "embedding", " --attention {causal,non-causal}",
"attention type for embeddings, use model default if unspecified" });
options.push_back({ "context hacking" });
options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
@ -1494,6 +1541,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
@ -1536,9 +1584,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
"note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
@ -1552,6 +1599,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
options.push_back({ "retrieval" });
options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
@ -1628,6 +1676,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
options.push_back({ "export-lora" });
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
for (const auto & o : options) {
@ -1991,9 +2046,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
@ -2039,19 +2094,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
int err = llama_model_apply_lora_from_file(model,
lora_adapter.c_str(),
lora_scale,
((i > 0) || params.lora_base.empty())
? NULL
: params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
if (adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
llama_lora_adapter_set(lctx, adapter, lora_scale);
}
if (params.ignore_eos) {
@ -2061,7 +2111,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (params.warmup) {
LOG("warming up the model with an empty run\n");
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
std::vector<llama_token> tmp;
llama_token bos = llama_token_bos(model);
llama_token eos = llama_token_eos(model);
// some models (e.g. T5) don't have a BOS token
if (bos != -1) {
tmp.push_back(bos);
}
tmp.push_back(eos);
if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = bos;
}
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
@ -2144,6 +2211,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.attention_type = params.attention_type;
cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
@ -2163,7 +2231,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
return str.rfind(prefix, 0) == 0;
}
static bool llama_download_file(const std::string & url, const std::string & path) {
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@ -2178,6 +2246,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
// Check if hf-token or bearer-token was specified
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer ";
auth_header += hf_token.c_str();
struct curl_slist *http_headers = NULL;
http_headers = curl_slist_append(http_headers, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
}
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
@ -2373,6 +2450,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
struct llama_model * llama_load_model_from_url(
const char * model_url,
const char * path_model,
const char * hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
@ -2380,7 +2458,7 @@ struct llama_model * llama_load_model_from_url(
return NULL;
}
if (!llama_download_file(model_url, path_model)) {
if (!llama_download_file(model_url, path_model, hf_token)) {
return NULL;
}
@ -2428,14 +2506,14 @@ struct llama_model * llama_load_model_from_url(
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (int idx = 1; idx < n_split; idx++) {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
return llama_download_file(split_url, split_path);
return llama_download_file(split_url, split_path, hf_token);
}, idx));
}
@ -2454,6 +2532,7 @@ struct llama_model * llama_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
const char * hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
@ -2469,7 +2548,7 @@ struct llama_model * llama_load_model_from_hf(
model_url += "/resolve/main/";
model_url += model;
return llama_load_model_from_url(model_url.c_str(), path_model, params);
return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
}
#else
@ -2477,6 +2556,7 @@ struct llama_model * llama_load_model_from_hf(
struct llama_model * llama_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
@ -2486,6 +2566,7 @@ struct llama_model * llama_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
@ -2550,51 +2631,35 @@ std::vector<llama_token> llama_tokenize(
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) {
piece.resize(-n_chars);
int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars);
}
else {
piece.resize(n_chars);
}
return std::string(result.data(), result.size());
return piece;
}
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
std::string piece;
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
piece = llama_token_to_piece(ctx, tokens[i]);
// remove the leading space of the first non-BOS token
if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
piece = piece.substr(1);
}
result += piece;
std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
return result;
}
std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
std::string piece;
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
piece = llama_token_to_piece(ctx, tokens[i]);
result += piece;
}
text.resize(n_chars);
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
return result;
return text;
}
bool llama_should_add_bos_token(const llama_model * model) {
@ -2618,6 +2683,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
const std::vector<llama_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
std::vector<llama_chat_message> chat;
for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});
@ -2630,10 +2696,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// error: chat template is not supported
if (res < 0) {
if (ptr_tmpl != nullptr) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
} else {
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}
}
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
res = llama_chat_apply_template(
fallback ? nullptr : model,
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
@ -2645,12 +2727,19 @@ std::string llama_chat_format_single(const struct llama_model * model,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass) {
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
std::ostringstream ss;
auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
ss << "\n";
};
// format chat with new_msg
chat_new.push_back(new_msg);
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return formatted;
// get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
}
std::string llama_chat_format_example(const struct llama_model * model,
@ -2804,125 +2893,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
//
static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
int32_t n_tensors;
size_t n_bytes = 0;
uint32_t max_direction_layer = 0;
llama_control_vector_data result = { -1, {} };
// calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
{
struct ggml_init_params meta_params = {
/* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ true,
};
ggml_context * meta_ctx = ggml_init(meta_params);
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &meta_ctx,
};
struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!meta_ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
return result;
}
n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
uint32_t layer = std::stoi(name.substr(dotpos + 1));
if (layer == 0) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (layer > max_direction_layer) {
max_direction_layer = layer;
}
} catch (...) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
}
struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor_meta);
} else if (ggml_nelements(tensor_meta) != result.n_embd) {
fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
n_bytes += ggml_nbytes(tensor_meta);
}
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false,
/* .ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
return result;
}
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
return result;
}
// load and scale tensors into final control vector context
struct ggml_init_params ggml_params = {
/* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(ggml_params);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(ctx_gguf, i);
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(ctx);
return result;
}
int layer_idx = -1;
// do not store data for layer 0 (it's not used)
result.data.resize(result.n_embd * max_direction_layer);
for (uint32_t il = 1; il <= max_direction_layer; il++) {
const std::string name = "direction." + std::to_string(il);
const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
float * dst = result.data.data() + result.n_embd * (il - 1);
if (tensor) {
const float * src = (const float *) tensor->data;
for (int j = 0; j < result.n_embd; j++) {
dst[j] = src[j] * load_info.strength;
}
} else {
for (int j = 0; j < result.n_embd; j++) {
dst[j] = 0.0f;
// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
layer_idx = std::stoi(name.substr(dotpos + 1));
} catch (...) {
layer_idx = -1;
}
}
if (layer_idx < 0) {
fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
} else if (layer_idx == 0) {
fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor->type != GGML_TYPE_F32) {
fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (ggml_n_dims(tensor) != 1) {
fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor);
} else if (ggml_nelements(tensor) != result.n_embd) {
fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
// extend if necessary - do not store data for layer 0 (it's not used)
result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
const float * src = (const float *) tensor->data;
float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
for (int j = 0; j < result.n_embd; j++) {
dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
}
}
if (result.n_embd == -1) {
fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
result.data.clear();
}
gguf_free(ctx_gguf);
ggml_free(ctx);
return result;
}
@ -2933,16 +2984,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
auto cur = llama_control_vector_load_one(info);
if (cur.n_embd == -1) {
return result;
result.n_embd = -1;
break;
}
if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
return result;
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
result.n_embd = -1;
break;
}
if (result.n_embd == -1) {
result = std::move(cur);
} else {
result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
for (size_t i = 0; i < cur.data.size(); i++) {
result.data[i] += cur.data[i];
}
@ -2950,7 +3004,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
}
if (result.n_embd == -1) {
fprintf(stderr, "%s: no vectors passed\n", __func__);
fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
result.data.clear();
}
return result;
@ -3118,7 +3173,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

View File

@ -99,6 +99,7 @@ struct gpt_params {
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
@ -107,6 +108,7 @@ struct gpt_params {
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_token = ""; // HF token
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
@ -126,7 +128,6 @@ struct gpt_params {
// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
@ -200,6 +201,7 @@ struct gpt_params {
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
bool enable_chat_template = true;
std::vector<std::string> api_keys;
@ -250,8 +252,13 @@ struct gpt_params {
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};
void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@ -307,8 +314,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
// Batch utils
@ -346,21 +353,13 @@ std::string llama_token_to_piece(
llama_token token,
bool special = true);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space
//
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// removes the leading space from the first non-BOS token
std::string llama_detokenize_spm(
// optionally renders special/control tokens
std::string llama_detokenize(
llama_context * ctx,
const std::vector<llama_token> & tokens);
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
std::string llama_detokenize_bpe(
llama_context * ctx,
const std::vector<llama_token> & tokens);
const std::vector<llama_token> & tokens,
bool special = true);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
@ -380,6 +379,8 @@ struct llama_chat_msg {
bool llama_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string llama_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & chat,
@ -454,4 +455,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

View File

@ -316,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
};
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
@ -720,7 +720,7 @@ private:
}
prop_names.push_back(prop_name);
}
if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule =
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")

View File

@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
buf << "[ ";
bool first = true;
for (const auto &token : tokens)
for (const auto & token : tokens)
{
if (!first) {
buf << ", ";

View File

@ -37,11 +37,18 @@ struct llama_ngram {
}
};
struct llama_token_hash_function {
size_t operator()(const llama_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu;
}
};
struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
size_t hash = 0;
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
}
return hash;
}

View File

@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
GGML_ASSERT(!original_logits.empty());
}
llama_token id = 0;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
if (temp < 0.0) {
// greedy sampling, with probs
@ -324,12 +322,15 @@ static llama_token llama_sampling_sample_impl(
}
if (ctx_sampling->grammar != NULL && !is_resampling) {
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
*original_logits = {logits, logits + n_vocab};
}
// apply params.logit_bias map
@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
@ -420,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
}
return cur_p;
@ -454,6 +455,6 @@ void llama_sampling_accept(
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
@ -15,9 +15,9 @@
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
# python3 convert-hf-to-gguf-update.py <huggingface_token>
# python3 convert_hf_to_gguf_update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
@ -37,7 +37,7 @@ from enum import IntEnum, auto
from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")
logger = logging.getLogger("convert_hf_to_gguf_update")
sess = requests.Session()
@ -45,20 +45,21 @@ class TOKENIZER_TYPE(IntEnum):
SPM = auto()
BPE = auto()
WPM = auto()
UGM = auto()
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
if not token.startswith("hf_"):
logger.info("Huggingface token seems invalid")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
sys.exit(1)
else:
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
sys.exit(1)
# TODO: add models here, base models preferred
@ -85,6 +86,14 @@ models = [
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
]
@ -93,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
f.write(response.content)
with open(save_path, 'wb') as downloaded_file:
downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
@ -106,9 +115,13 @@ def download_model(model):
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM:
files.append("tokenizer.model")
if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model")
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
@ -124,14 +137,14 @@ for model in models:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
src_ifs = ""
for model in models:
name = model["name"]
tokt = model["tokt"]
if tokt == TOKENIZER_TYPE.SPM:
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
continue
# Skip if the tokenizer folder does not exist or there are other download issues previously
@ -141,12 +154,15 @@ for model in models:
# create the tokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt)
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
@ -178,7 +194,7 @@ src_func = f"""
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(chktxt)}
chktxt = {repr(CHK_TXT)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -188,7 +204,7 @@ src_func = f"""
res = None
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
# NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
{src_ifs}
@ -197,9 +213,9 @@ src_func = f"""
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
@ -213,7 +229,7 @@ src_func = f"""
return res
"""
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
@ -224,7 +240,7 @@ convert_py = re.sub(
convert_py_pth.write_text(convert_py, encoding="utf-8")
logger.info("+++ convert-hf-to-gguf.py was updated")
logger.info("+++ convert_hf_to_gguf.py was updated")
# generate tests for each tokenizer model
@ -262,6 +278,7 @@ tests = [
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"!!!!!!",
"3",
"33",
"333",
@ -271,8 +288,9 @@ tests = [
"3333333",
"33333333",
"333333333",
# "Cửa Việt", # llama-bpe fails on this
chktxt,
"Cửa Việt", # llama-bpe fails on this
" discards",
CHK_TXT,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
@ -299,7 +317,10 @@ for model in models:
# create the tokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop
@ -325,6 +346,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
for model in models:
name = model["name"]
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
logger.info("\n")

View File

@ -132,6 +132,10 @@ class Tensor:
class GGMLModel:
file_format: GGMLFormat
format_version: int
def __init__(self):
self.hyperparameters = None
self.vocab = None
@ -290,7 +294,7 @@ class GGMLToGGUF:
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)
@ -354,7 +358,8 @@ class GGMLToGGUF:
def handle_metadata(cfg, hp):
import convert
import examples.convert_legacy_llama as convert
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json"
orig_config_path = cfg.model_metadata_dir / "params.json"

393
convert_lora_to_gguf.py Executable file
View File

@ -0,0 +1,393 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
from dataclasses import dataclass
import logging
import argparse
import os
import sys
import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
import torch
if TYPE_CHECKING:
from torch import Tensor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model
logger = logging.getLogger("lora-to-gguf")
@dataclass
class PartialLoraTensor:
A: Tensor | None = None
B: Tensor | None = None
# magic to support tensor shape modifications and splitting
class LoraTorchTensor:
_lora_A: Tensor # (n_rank, row_size)
_lora_B: Tensor # (col_size, n_rank)
_rank: int
def __init__(self, A: Tensor, B: Tensor):
assert len(A.shape) == len(B.shape)
assert A.shape[-2] == B.shape[-1]
if A.dtype != B.dtype:
A = A.to(torch.float32)
B = B.to(torch.float32)
self._lora_A = A
self._lora_B = B
self._rank = B.shape[-1]
def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
return (self._lora_A, self._lora_B)
def __getitem__(
self,
indices: (
SupportsIndex
| slice
| tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature
),
) -> LoraTorchTensor:
shape = self.shape
if isinstance(indices, SupportsIndex):
if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
else:
raise NotImplementedError # can't return a vector
elif isinstance(indices, slice):
if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
else:
return LoraTorchTensor(self._lora_A, self._lora_B[indices])
elif isinstance(indices, tuple):
assert len(indices) > 0
if indices[-1] is Ellipsis:
return self[indices[:-1]]
# expand ellipsis
indices = tuple(
u
for v in (
(
(slice(None, None) for _ in range(len(indices) - 1))
if i is Ellipsis
else (i,)
)
for i in indices
)
for u in v
)
if len(indices) < len(shape):
indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
# TODO: make sure this is correct
indices_A = (
*(
(
j.__index__() % self._lora_A.shape[i]
if isinstance(j, SupportsIndex)
else slice(None, None)
)
for i, j in enumerate(indices[:-2])
),
slice(None, None),
indices[-1],
)
indices_B = indices[:-1]
return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
else:
raise NotImplementedError # unknown indice type
@property
def dtype(self) -> torch.dtype:
assert self._lora_A.dtype == self._lora_B.dtype
return self._lora_A.dtype
@property
def shape(self) -> tuple[int, ...]:
assert len(self._lora_A.shape) == len(self._lora_B.shape)
return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
def size(self, dim=None):
assert dim is None
return self.shape
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
if isinstance(shape[0], tuple):
new_shape: tuple[int, ...] = shape[0]
else:
new_shape = cast(tuple[int, ...], shape)
orig_shape = self.shape
if len(new_shape) < 2:
raise NotImplementedError # can't become a vector
# expand -1 in the shape
if any(dim == -1 for dim in new_shape):
n_elems = prod(orig_shape)
n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
assert n_elems % n_new_elems == 0
new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
if new_shape[-1] != orig_shape[-1]:
raise NotImplementedError # can't reshape the row size trivially
shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
shape_B = (*new_shape[:-1], self._rank)
return LoraTorchTensor(
self._lora_A.reshape(shape_A),
self._lora_B.reshape(shape_B),
)
def reshape_as(self, other: Tensor) -> LoraTorchTensor:
return self.reshape(*other.shape)
def view(self, *size: int) -> LoraTorchTensor:
return self.reshape(*size)
def permute(self, *dims: int) -> LoraTorchTensor:
shape = self.shape
dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
if dims[-1] == -1:
# TODO: support higher dimensional A shapes bigger than 1
assert all(dim == 1 for dim in self._lora_A.shape[:-2])
return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
else:
# TODO: compose the above two
raise NotImplementedError
def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
shape = self.shape
dims = [i for i in range(len(shape))]
dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
return self.permute(*dims)
def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
return self.transpose(axis0, axis1)
def to(self, *args, **kwargs):
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
@classmethod
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
del types # unused
if kwargs is None:
kwargs = {}
if func is torch.permute:
return type(args[0]).permute(*args, **kwargs)
elif func is torch.reshape:
return type(args[0]).reshape(*args, **kwargs)
elif func is torch.stack:
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
return LoraTorchTensor(
torch.stack([a._lora_A for a in args[0]], dim),
torch.stack([b._lora_B for b in args[0]], dim),
)
elif func is torch.cat:
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
if len(args[0][0].shape) > 2:
return LoraTorchTensor(
torch.cat([a._lora_A for a in args[0]], dim),
torch.cat([b._lora_B for b in args[0]], dim),
)
elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
return LoraTorchTensor(
args[0][0]._lora_A,
torch.cat([b._lora_B for b in args[0]], dim),
)
else:
raise NotImplementedError
else:
raise NotImplementedError
def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight")
return base_name
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
"--bigendian", action="store_true",
help="model is executed on big endian machine",
)
parser.add_argument(
"--no-lazy", action="store_true",
help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
)
parser.add_argument(
"--verbose", action="store_true",
help="increase output verbosity",
)
parser.add_argument(
"--dry-run", action="store_true",
help="only print out what will be done, without writing any new files",
)
parser.add_argument(
"--base", type=Path, required=True,
help="directory containing base model file",
)
parser.add_argument(
"lora_path", type=Path,
help="directory containing LoRA adapter file",
)
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
ftype_map: dict[str, gguf.LlamaFileType] = {
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"auto": gguf.LlamaFileType.GUESSED,
}
ftype = ftype_map[args.outtype]
dir_base_model: Path = args.base
dir_lora: Path = args.lora_path
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_lora
if os.path.exists(input_model):
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
lora_model = load_file(input_model, device="cpu")
else:
input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
# load base model
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model)
with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
class LoraModel(model_class):
model_arch = model_class.model_arch
lora_alpha: float
def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
super().__init__(*args, **kwargs)
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
super().set_gguf_parameters()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_map: dict[str, PartialLoraTensor] = {}
for name, tensor in lora_model.items():
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
is_lora_a = ".lora_A.weight" in name
is_lora_b = ".lora_B.weight" in name
if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name:
continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
sys.exit(1)
if base_name in tensor_map:
if is_lora_a:
tensor_map[base_name].A = tensor
else:
tensor_map[base_name].B = tensor
else:
if is_lora_a:
tensor_map[base_name] = PartialLoraTensor(A=tensor)
else:
tensor_map[base_name] = PartialLoraTensor(B=tensor)
for name, tensor in tensor_map.items():
assert tensor.A is not None
assert tensor.B is not None
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid)
for dest_name, dest_data in dest:
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
model_instance = LoraModel(
dir_base_model,
ftype,
fname_out,
is_big_endian=args.bigendian,
use_temp_file=False,
eager=args.no_lazy,
dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
)
logger.info("Exporting model...")
model_instance.write()
logger.info(f"Model successfully exported to {model_instance.fname_out}")

56
docs/android.md Normal file
View File

@ -0,0 +1,56 @@
# Android
## Build on Android using Termux
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
## Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```
$ mkdir build-android
$ cd build-android
$ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here's a demo of an interactive session running on Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

View File

@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
```sh
./build/bin/llama-ls-sycl-device
```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
```
found 6 SYCL devices:
found 2 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
| Attribute | Note |
|------------------------|-------------------------------------------------------------|
| compute capability 1.3 | Level-zero driver/runtime, recommended |
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
4. Launch inference
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
- Multiple devices: Automatically choose the devices with the same backend.
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
| Device selection | Parameter |
|------------------|----------------------------------------|
@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
build\bin\ls-sycl-device.exe
```
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
```
found 6 SYCL devices:
found 2 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
| Attribute | Note |
|------------------------|-----------------------------------------------------------|
| compute capability 1.3 | Level-zero running time, recommended |
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
4. Launch inference
There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
- Single device: Use one device assigned by user. Default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend.
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
| Device selection | Parameter |
|------------------|----------------------------------------|

353
docs/build.md Normal file
View File

@ -0,0 +1,353 @@
# Build llama.cpp locally
**To get the Code:**
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
In order to build llama.cpp you have four different options.
- Using `make`:
- On Linux or MacOS:
```bash
make
```
- On Windows (x86/x64 only, arm64 requires cmake):
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc.
3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder.
5. From here you can run:
```bash
make
```
- Notes:
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
**Notes**:
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
- Tab Workload: Desktop-development with C++
- Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
- Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
- For Windows on ARM (arm64, WoA) build with:
```bash
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
2. Add your user to **video** group
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
## Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
## BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
### Accelerate Framework:
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
### OpenBLAS:
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
- Using `make`:
- On Linux:
```bash
make GGML_OPENBLAS=1
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
3. Extract `w64devkit` on your pc.
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder.
8. From here you can run:
```bash
make GGML_OPENBLAS=1
```
- Using `CMake` on Linux:
```bash
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release
```
### BLIS
Check [BLIS.md](./backend/BLIS.md) for more information.
### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
### Intel oneMKL
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
- Using manual oneAPI installation:
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
cmake --build build --config Release
```
- Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
### CUDA
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
- Using `make`:
```bash
make GGML_CUDA=1
```
- Using `CMake`:
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
### MUSA
- Using `make`:
```bash
make GGML_MUSA=1
```
- Using `CMake`:
```bash
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
```
### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
make GGML_HIPBLAS=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
Note that if you get the following error:
```
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
```
Try searching for a directory under `HIP_PATH` that contains the file
`oclc_abi_version_400.bc`. Then, add the following to the start of the
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
like:
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
### Vulkan
**Windows**
#### w64devkit
Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
```sh
SDK_VERSION=1.3.283.0
cp /VulkanSDK/$SDK_VERSION/Bin/glslc.exe $W64DEVKIT_HOME/bin/
cp /VulkanSDK/$SDK_VERSION/Lib/vulkan-1.lib $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/
cp -r /VulkanSDK/$SDK_VERSION/Include/* $W64DEVKIT_HOME/x86_64-w64-mingw32/include/
cat > $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/pkgconfig/vulkan.pc <<EOF
Name: Vulkan-Loader
Description: Vulkan Loader
Version: $SDK_VERSION
Libs: -lvulkan-1
EOF
```
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
#### MSYS2
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
```sh
pacman -S git \
mingw-w64-ucrt-x86_64-gcc \
mingw-w64-ucrt-x86_64-cmake \
mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc
```
Switch into `llama.cpp` directory and build using CMake.
```sh
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
**With docker**:
You don't need to install Vulkan SDK. It will be installed inside the container.
```sh
# Build the image
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
**Without docker**:
Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
For example, on Ubuntu 22.04 (jammy), use the command below:
```bash
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt update -y
apt-get install -y vulkan-sdk
# To verify the installation, use the command below:
vulkaninfo
```
Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below:
```bash
cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### Android
To read documentation for how to build on Android, [click here](./android.md)

View File

@ -1,4 +1,4 @@
## Add a new model architecture to `llama.cpp`
# Add a new model architecture to `llama.cpp`
Adding a model requires few steps:
@ -9,15 +9,15 @@ Adding a model requires few steps:
After following these steps, you can open PR.
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main)
- [imatrix](../examples/imatrix)
- [quantize](../examples/quantize)
- [server](../examples/server)
- [main](/examples/main/)
- [imatrix](/examples/imatrix/)
- [quantize](/examples/quantize/)
- [server](/examples/server/)
### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
@ -31,7 +31,7 @@ class MyModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
```
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
@ -54,7 +54,7 @@ Example for `falcon` model:
As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
## GGUF specification

View File

@ -1,7 +1,7 @@
# Token generation performance troubleshooting
## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```

86
docs/docker.md Normal file
View File

@ -0,0 +1,86 @@
# Docker
## Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (ex. /llama/models)
## Images
We have three Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, there the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
## Usage
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On completion, you are ready to play!
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a light image:
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a server image:
```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
```
## Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
## Building Docker locally
```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
The defaults are:
- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
The resulting images, are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
## Usage
After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```

39
docs/install.md Normal file
View File

@ -0,0 +1,39 @@
# Install pre-built version of llama.cpp
## Homebrew
On Mac and Linux, the homebrew package manager can be used via
```sh
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
## Nix
On Mac and Linux, the Nix package manager can be used via
```sh
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.
Or
```sh
nix-env --file '<nixpkgs>' --install --attr llama-cpp
```
For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
## Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```sh
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.

View File

@ -21,8 +21,8 @@ else()
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(finetune)
add_subdirectory(gbnf-validator)
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(gritlm)
@ -52,5 +52,4 @@ else()
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(train-text-from-scratch)
endif()

View File

@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
if nTokens < 0 {
let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount)
@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
token,
&result,
Int32(result.count),
0,
false
)
assert(check == actualTokensCount)

View File

@ -31,7 +31,7 @@ int main(int argc, char ** argv) {
int n_parallel = params.n_parallel;
// total length of the sequences including the prompt
int n_predict = 32;
int n_predict = params.n_predict;
// init LLM
@ -93,14 +93,34 @@ int main(int argc, char ** argv) {
// create a llama_batch
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
std::vector<llama_seq_id> seq_ids(n_parallel, 0);
for (int32_t i = 0; i < n_parallel; ++i) {
seq_ids[i] = i;
}
// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); ++i) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
}
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = llama_token_bos(model);
}
llama_batch_clear(batch);
llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
@ -109,11 +129,11 @@ int main(int argc, char ** argv) {
return 1;
}
// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (int32_t i = 1; i < n_parallel; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
//// assign the system KV cache to all parallel sequences
//// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
//for (int32_t i = 1; i < n_parallel; ++i) {
// llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
//}
if (n_parallel > 1) {
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);

View File

@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
import numpy as np
@ -346,42 +346,6 @@ class Params:
return params
@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
licence: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None
@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()
with open(metadata_path, 'r') as file:
data = json.load(file)
# Create a new Metadata instance
metadata = Metadata()
# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
return metadata
#
# data loading
# TODO: reuse (probably move to gguf.py?)
@ -492,12 +456,13 @@ class LazyTensor:
LazyModel: TypeAlias = 'dict[str, LazyTensor]'
ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']
@dataclass
class ModelPlus:
model: LazyModel
paths: list[Path] # Where this was read from.
format: Literal['ggml', 'torch', 'safetensors', 'none']
format: ModelFormat
vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
@ -536,7 +501,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
formats = set(mp.format for mp in models_plus)
formats: set[ModelFormat] = set(mp.format for mp in models_plus)
assert len(formats) == 1, "different formats?"
format = formats.pop()
paths = [path for mp in models_plus for path in mp.paths]
@ -555,7 +520,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
else:
model = merge_sharded([mp.model for mp in models_plus])
return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
return ModelPlus(model, paths, format, vocab)
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -805,7 +770,7 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_model(self, params: Params, metadata: Metadata) -> None:
def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
# Metadata About The Model And Its Provenence
name = "LLaMA"
if metadata is not None and metadata.name is not None:
@ -823,16 +788,73 @@ class OutputFile:
self.gguf.add_author(metadata.author)
if metadata.version is not None:
self.gguf.add_version(metadata.version)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.organization is not None:
self.gguf.add_organization(metadata.organization)
if metadata.finetune is not None:
self.gguf.add_finetune(metadata.finetune)
if metadata.basename is not None:
self.gguf.add_basename(metadata.basename)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.licence is not None:
self.gguf.add_licence(metadata.licence)
if metadata.quantized_by is not None:
self.gguf.add_quantized_by(metadata.quantized_by)
if metadata.size_label is not None:
self.gguf.add_size_label(metadata.size_label)
if metadata.license is not None:
self.gguf.add_license(metadata.license)
if metadata.license_name is not None:
self.gguf.add_license_name(metadata.license_name)
if metadata.license_link is not None:
self.gguf.add_license_link(metadata.license_link)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.doi is not None:
self.gguf.add_doi(metadata.doi)
if metadata.uuid is not None:
self.gguf.add_uuid(metadata.uuid)
if metadata.repo_url is not None:
self.gguf.add_repo_url(metadata.repo_url)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
if metadata.source_doi is not None:
self.gguf.add_source_doi(metadata.source_doi)
if metadata.source_uuid is not None:
self.gguf.add_source_uuid(metadata.source_uuid)
if metadata.source_repo_url is not None:
self.gguf.add_source_repo_url(metadata.source_repo_url)
if metadata.base_models is not None:
self.gguf.add_base_model_count(len(metadata.base_models))
for key, base_model_entry in enumerate(metadata.base_models):
if "name" in base_model_entry:
self.gguf.add_base_model_name(key, base_model_entry["name"])
if "author" in base_model_entry:
self.gguf.add_base_model_author(key, base_model_entry["author"])
if "version" in base_model_entry:
self.gguf.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
if "url" in base_model_entry:
self.gguf.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
self.gguf.add_base_model_doi(key, base_model_entry["doi"])
if "uuid" in base_model_entry:
self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
if "repo_url" in base_model_entry:
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
if metadata.tags is not None:
self.gguf.add_tags(metadata.tags)
if metadata.languages is not None:
self.gguf.add_languages(metadata.languages)
if metadata.datasets is not None:
self.gguf.add_datasets(metadata.datasets)
def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
@ -943,7 +965,7 @@ class OutputFile:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -977,7 +999,7 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
metadata: Metadata = None,
metadata: gguf.Metadata | None = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1020,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}")
def model_parameter_count(model: LazyModel) -> int:
total_model_parameters = 0
for i, (name, lazy_tensor) in enumerate(model.items()):
sum_weights_in_tensor = 1
def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
total_params = 0
shared_params = 0
expert_params = 0
for name, lazy_tensor in tensors:
# We don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# Got A Tensor
sum_weights_in_tensor: int = 1
# Tensor Volume
for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters
if ".experts." in name:
if ".experts.0." in name:
expert_params += sum_weights_in_tensor
else:
shared_params += sum_weights_in_tensor
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
total_params += sum_weights_in_tensor
return f"{round(scaled_model_params)}{scale_suffix}"
return total_params, shared_params, expert_params
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@ -1230,34 +1249,24 @@ class VocabFactory:
return vocab, special_vocab
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
quantization = {
def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
name = metadata.name if metadata.name is not None else None
basename = metadata.basename if metadata.basename is not None else None
finetune = metadata.finetune if metadata.finetune is not None else None
version = metadata.version if metadata.version is not None else None
size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
output_type = {
GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0: "Q8_0",
}[file_type]
parameters = model_parameter_count_rounded_notation(model_params_count)
expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"
version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"
name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf"
if ret in model_paths:
logger.error(
@ -1296,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
parser.add_argument("--model-name", type=str, default=None, help="name of the model")
args = parser.parse_args(args_in)
@ -1309,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None:
else:
logging.basicConfig(level=logging.INFO)
metadata = Metadata.load(args.metadata)
model_name = args.model_name
dir_model = args.model
metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
if args.get_outfile:
model_plus = load_some_model(args.model)
model_plus = load_some_model(dir_model)
params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model)
ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
ftype = pick_output_type(model, args.outtype)
if (metadata is None or metadata.name is None) and params.path_model is not None:
metadata.name = params.path_model.name
print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
return
if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")
if args.dump_single:
model_plus = lazy_load_file(args.model)
model_plus = lazy_load_file(dir_model)
do_dump_model(model_plus)
return
if not args.vocab_only:
model_plus = load_some_model(args.model)
model_plus = load_some_model(dir_model)
else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
if args.dump:
do_dump_model(model_plus)
@ -1367,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None:
logger.info(f"params = {params}")
model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
vocab_factory = VocabFactory(vocab_path)
vocab_types = None if args.no_vocab else args.vocab_type.split(",")
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
@ -1396,13 +1410,23 @@ def main(args_in: list[str] | None = None) -> None:
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
vocab = model_plus.vocab
assert params is not None
if metadata.name is None and params.path_model is not None:
metadata.name = params.path_model.name
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
logger.info(f"Vocab info: {vocab}")
logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
params.ftype = ftype
logger.info(f"Writing {outfile}, format {ftype}")

View File

@ -0,0 +1,49 @@
# Migration notice for binary filenames
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
This migration was important, but it is a breaking change that may not always be immediately obvious to users.
Please update all scripts and workflows to use the new binary names.
| Old Filename | New Filename |
| ---- | ---- |
| main | llama-cli |
| server | llama-server |
| llama-bench | llama-bench |
| embedding | llama-embedding |
| quantize | llama-quantize |
| tokenize | llama-tokenize |
| export-lora | llama-export-lora |
| libllava.a | libllava.a |
| baby-llama | llama-baby-llama |
| batched | llama-batched |
| batched-bench | llama-batched-bench |
| benchmark-matmult | llama-benchmark-matmult |
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
| eval-callback | llama-eval-callback |
| gbnf-validator | llama-gbnf-validator |
| gguf | llama-gguf |
| gguf-split | llama-gguf-split |
| gritlm | llama-gritlm |
| imatrix | llama-imatrix |
| infill | llama-infill |
| llava-cli | llama-llava-cli |
| lookahead | llama-lookahead |
| lookup | llama-lookup |
| lookup-create | llama-lookup-create |
| lookup-merge | llama-lookup-merge |
| lookup-stats | llama-lookup-stats |
| parallel | llama-parallel |
| passkey | llama-passkey |
| perplexity | llama-perplexity |
| q8dot | llama-q8dot |
| quantize-stats | llama-quantize-stats |
| retrieval | llama-retrieval |
| save-load-state | llama-save-load-state |
| simple | llama-simple |
| speculative | llama-speculative |
| vdot | llama-vdot |
| tests/test-c.o | tests/test-c.o |

View File

@ -0,0 +1,35 @@
// Warns users that this filename was deprecated, and provides a link for more information.
#include <cstdio>
#include <string>
#include <unordered_map>
// Main
int main(int argc, char** argv) {
std::string filename = "main";
if (argc >= 1) {
filename = argv[0];
}
// Get only the program name from the full path
auto pos = filename.find_last_of('/');
if (pos != std::string::npos) {
filename = filename.substr(pos+1);
}
// Append "llama-" to the beginning of filename to get the replacemnt filename
auto replacement_filename = "llama-" + filename;
// The exception is if the filename is "main", then our replacement filename is "llama-cli"
if (filename == "main") {
replacement_filename = "llama-cli";
}
fprintf(stdout, "\n");
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
fprintf(stdout, "\n");
return EXIT_FAILURE;
}

View File

@ -58,4 +58,3 @@ The above command will output space-separated float values.
```powershell
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```

View File

@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
printf("%12.4f", v);
sum += v;
@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
char src1_str[128] = {0};
if (src1) {
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,

View File

@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model.
usage: llama-export-lora [options]
options:
-h, --help show this help message and exit
-m FNAME, --model-base FNAME model path from which to load base model (default '')
-o FNAME, --model-out FNAME path to save exported model (default '')
-l FNAME, --lora FNAME apply LoRA adapter
-s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S
-t N, --threads N number of threads to use during computation (default: 4)
-m, --model model path from which to load base model (default '')
--lora FNAME path to LoRA adapter (can be repeated to use multiple adapters)
--lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)
-t, --threads N number of threads to use during computation (default: 4)
-o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf')
```
For example:
@ -20,7 +19,15 @@ For example:
./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
--lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
```
Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
```bash
./bin/llama-export-lora \
-m your_base_model.gguf \
-o your_merged_model.gguf \
--lora-scaled lora_task_A.gguf 0.5 \
--lora-scaled lora_task_B.gguf 0.5
```

View File

@ -1,462 +1,420 @@
#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include <map>
#include <vector>
#include <string>
#include <thread>
#include <fstream>
struct lora_info {
std::string filename;
static bool g_verbose = false;
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
}
static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
}
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
file.write(&zero, 1);
}
}
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
throw std::runtime_error("failed to load input GGUF from " + fname);
}
return ctx_gguf;
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
struct file_input {
struct ggml_context * ctx_meta = nullptr;
struct gguf_context * ctx_gguf = nullptr;
std::ifstream f_in;
std::map<std::string, ggml_tensor *> tensors;
float alpha;
float scale;
file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
if (!f_in.is_open()) {
throw std::runtime_error("failed to open input gguf from " + fname);
}
ctx_gguf = load_gguf(fname, &ctx_meta);
alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
std::string name(cur->name);
tensors[name] = cur;
if (g_verbose) {
printf("%s: %s\n", __func__, cur->name);
}
}
}
ggml_tensor * get_tensor(std::string name) {
if (tensors.find(name) == tensors.end()) {
return nullptr;
}
return tensors[name];
}
void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
if (tensors.find(name) == tensors.end()) {
throw std::runtime_error("cannot find tensor with name: " + name);
}
auto len = ggml_nbytes(tensors[name]);
if (buf.size() < len) {
buf.resize(len);
}
auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
f_in.seekg(offset);
f_in.read((char* )buf.data(), len);
}
~file_input() {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
}
};
struct export_lora_params {
std::string fn_model_base;
std::string fn_model_out;
std::vector<struct lora_info> lora;
struct lora_merge_ctx {
// input base model + adapters
file_input base_model;
std::vector<std::unique_ptr<file_input>> adapters;
// for computing merged tensor
int n_threads;
};
ggml_backend_t backend = nullptr;
ggml_gallocr_t allocr = nullptr;
std::vector<uint8_t> read_buf;
struct lora_data {
struct lora_info info;
std::vector<uint8_t> data;
struct ggml_context * ctx;
// output file
struct gguf_context * ctx_out;
struct ggml_context * ctx_out_ggml;
std::ofstream fout;
uint32_t lora_r;
uint32_t lora_alpha;
};
lora_merge_ctx(
std::string & base_fname,
std::vector<std::tuple<std::string, float>> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
throw std::runtime_error("split model is not yet supported");
}
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
for (auto lora_inp : lora_files) {
auto fname = std::get<0>(lora_inp);
auto scale = std::get<1>(lora_inp);
std::unique_ptr<file_input> adapter(new file_input(fname, scale));
check_metadata_lora(adapter.get());
adapters.push_back(std::move(adapter));
}
ctx_out = gguf_init_empty();
struct ggml_init_params params = {
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_out_ggml = ggml_init(params);
backend = ggml_backend_cpu_init();
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}
void check_metadata_lora(file_input * adapter) {
auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
if (general_type != "adapter") {
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
}
auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
if (adapter_type != "lora") {
throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
}
auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
if (general_arch_base != general_arch_lora) {
throw std::runtime_error("model arch and LoRA arch mismatch");
}
}
ggml_type get_out_tensor_type(struct ggml_tensor * t) {
if (t->type == GGML_TYPE_F32) {
return GGML_TYPE_F32;
} else {
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
return GGML_TYPE_F16;
}
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void run_merge() {
// prepare metadata
gguf_set_kv(ctx_out, base_model.ctx_gguf);
// output is forced to f16 for now
gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
// check if all lora adapters have the same tensors
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
if (adapters.size() > 1) {
for (size_t i = 1; i < adapters.size(); ++i) {
if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
throw std::runtime_error(err_no_subset_adapter);
}
for (auto & it : adapters[i]->tensors) {
if (adapters[0]->get_tensor(it.first) == nullptr) {
throw std::runtime_error(err_no_subset_adapter);
}
}
}
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
die_fmt("read error: %s", strerror(errno));
// mapping base tensor to out tensor (same shape with base, but different type)
// if out_tensor == nullptr, we only copy it
std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
for (auto & it : base_model.tensors) {
bool t_a = true;
bool t_b = true;
for (auto & adapter : adapters) {
t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
}
auto base_tensor = it.second;
if (!t_a && !t_b) {
// only copy
struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
ggml_set_name(cpy_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
gguf_add_tensor(ctx_out, cpy_tensor);
} else if (t_a && t_b) {
// need merging
struct ggml_tensor * out_tensor = ggml_new_tensor(
ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
ggml_set_name(out_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
gguf_add_tensor(ctx_out, out_tensor);
} else {
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
}
}
if (ret != 1) {
die("unexpectedly reached end of file");
// placeholder for the meta data
{
size_t meta_size = gguf_get_meta_size(ctx_out);
zeros(fout, meta_size);
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t size) {
if (size == 0) {
return;
// process base model tensors
size_t n_merged = 0;
for (auto & it : base_to_out_tensors) {
if (it.second != nullptr) {
merge_tensor(it.first, it.second);
n_merged++;
} else {
copy_tensor(it.first);
}
}
errno = 0;
size_t ret = std::fwrite(ptr, size, 1, fp);
if (ret != 1) {
die_fmt("write error: %s", strerror(errno));
// write output metadata
{
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.seekp(0);
fout.write((const char *)data.data(), data.size());
}
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
void copy_tensor(struct ggml_tensor * base) {
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
size_t len = ggml_nbytes(base);
base_model.read_tensor_data(base->name, read_buf);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}
bool eof() {
return tell() >= size;
}
void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
std::string name_base(base->name);
std::string name_lora_a = name_base + ".lora_a";
std::string name_lora_b = name_base + ".lora_b";
~llama_file() {
if (fp) {
std::fclose(fp);
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
// context for input tensor
std::vector<struct ggml_tensor *> inp_a(adapters.size());
std::vector<struct ggml_tensor *> inp_b(adapters.size());
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
// alloc tensors
struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
for (size_t i = 0; i < adapters.size(); ++i) {
auto t_a = adapters[i]->get_tensor(name_lora_a);
auto t_b = adapters[i]->get_tensor(name_lora_b);
inp_a[i] = ggml_dup_tensor(ctx, t_a);
inp_b[i] = ggml_dup_tensor(ctx, t_b);
}
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
// load base tensor to backend buffer
base_model.read_tensor_data(name_base, read_buf);
if (base->type != GGML_TYPE_F32) {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
}
// load lora tensors to backend buffer
for (size_t i = 0; i < adapters.size(); ++i) {
adapters[i]->read_tensor_data(name_lora_a, read_buf);
ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
adapters[i]->read_tensor_data(name_lora_b, read_buf);
ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
}
// build graph
struct ggml_cgraph * gf;
{
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params0);
gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
// scale
const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0];
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
}
cur = ggml_cast(ctx0, cur, out->type);
printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
}
// compute
{
ggml_gallocr_alloc_graph(allocr, gf);
ggml_backend_cpu_set_n_threads(backend, n_threads);
ggml_backend_graph_compute(backend, gf);
}
// write data to output file
{
auto result = gf->nodes[gf->n_nodes - 1];
size_t len = ggml_nbytes(result);
if (read_buf.size() < len) {
read_buf.resize(len);
}
ggml_backend_tensor_get(result, read_buf.data(), 0, len);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
}
~lora_merge_ctx() {
ggml_gallocr_free(allocr);
ggml_backend_free(backend);
gguf_free(ctx_out);
ggml_free(ctx_out_ggml);
}
};
static struct export_lora_params get_default_export_lora_params() {
struct export_lora_params result;
result.fn_model_base = "";
result.fn_model_out = "";
result.n_threads = GGML_DEFAULT_N_THREADS;
return result;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str());
fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n");
fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads);
}
static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
bool invalid_param = false;
std::string arg;
struct export_lora_params default_params = get_default_export_lora_params();
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "-m" || arg == "--model-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_model_base = argv[i];
} else if (arg == "-o" || arg == "--model-out") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_model_out = argv[i];
} else if (arg == "-l" || arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
struct lora_info lora;
lora.filename = argv[i];
lora.scale = 1.0f;
params->lora.push_back(lora);
} else if (arg == "-s" || arg == "--lora-scaled") {
if (++i >= argc) {
invalid_param = true;
break;
}
struct lora_info lora;
lora.filename = argv[i];
if (++i >= argc) {
invalid_param = true;
break;
}
lora.scale = std::stof(argv[i]);
params->lora.push_back(lora);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->n_threads = std::stoi(argv[i]);
if (params->n_threads <= 0) {
params->n_threads = std::thread::hardware_concurrency();
}
} else {
fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
}
if (params->fn_model_base == default_params.fn_model_base) {
fprintf(stderr, "error: please specify a filename for model-base.\n");
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
if (params->fn_model_out == default_params.fn_model_out) {
fprintf(stderr, "error: please specify a filename for model-out.\n");
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
return true;
}
static void free_lora(struct lora_data * lora) {
if (lora->ctx != NULL) {
ggml_free(lora->ctx);
}
delete lora;
}
static struct lora_data * load_lora(struct lora_info * info) {
struct lora_data * result = new struct lora_data;
result->info = *info;
result->ctx = NULL;
result->lora_r = 1;
result->lora_alpha = 1;
struct llama_file file(info->filename.c_str(), "rb");
if (file.fp == NULL) {
fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
info->filename.c_str());
free_lora(result);
return NULL;
}
struct ggml_init_params params_ggml;
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
params_ggml.mem_buffer = NULL;
params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml);
uint32_t magic = file.read_u32();
if (magic != LLAMA_FILE_MAGIC_GGLA) {
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
}
uint32_t version = file.read_u32();
if (version != 1) {
die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
}
result->lora_r = file.read_u32();
result->lora_alpha = file.read_u32();
// read tensor infos from file
std::vector<char> name_buf;
std::vector<struct ggml_tensor *> tensors;
std::vector<size_t> tensors_offset;
size_t total_nbytes_pad = 0;
while(!file.eof()) {
int64_t ne[4] = {1,1,1,1};
uint32_t n_dims = file.read_u32();
uint32_t namelen = file.read_u32();
uint32_t type = file.read_u32();
for (uint32_t k = 0; k < n_dims; ++k) {
ne[k] = (int64_t)file.read_u32();
}
name_buf.clear();
name_buf.resize(namelen + 1, '\0');
file.read_raw(name_buf.data(), namelen);
file.seek((0-file.tell()) & 31, SEEK_CUR);
size_t offset = file.tell();
struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
ggml_set_name(tensor, name_buf.data());
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
total_nbytes_pad += nbytes_pad;
tensors.push_back(tensor);
tensors_offset.push_back(offset);
file.seek(nbytes, SEEK_CUR);
}
// read tensor data
result->data.resize(total_nbytes_pad);
size_t data_offset = 0;
for (size_t i = 0; i < tensors.size(); ++i) {
struct ggml_tensor * tensor = tensors[i];
size_t offset = tensors_offset[i];
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
file.seek(offset, SEEK_SET);
tensor->data = result->data.data() + data_offset;
file.read_raw(tensor->data, nbytes);
data_offset += nbytes_pad;
}
return result;
}
static struct ggml_cgraph * build_graph_lora(
struct ggml_context * ctx,
struct ggml_tensor * tensor,
struct ggml_tensor * lora_a,
struct ggml_tensor * lora_b,
float scaling
) {
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
if (scaling != 1.0f) {
ab = ggml_scale(ctx, ab, scaling);
}
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand (gf, res);
return gf;
}
static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
if (lora->ctx == NULL) {
return false;
}
std::string name = ggml_get_name(tensor);
std::string name_a = name + std::string(".loraA");
std::string name_b = name + std::string(".loraB");
struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
if (lora_a == NULL || lora_b == NULL) {
return false;
}
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
struct ggml_init_params params;
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
params.mem_buffer = NULL;
params.no_alloc = true;
struct ggml_context * ctx = NULL;
struct ggml_gallocr * alloc = NULL;
struct ggml_cgraph * gf = NULL;
ctx = ggml_init(params);
alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
ggml_gallocr_alloc_graph(alloc, gf);
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
static std::vector<uint8_t> data_work;
data_work.resize(cplan.work_size);
cplan.work_data = data_work.data();
ggml_graph_compute(gf, &cplan);
ggml_gallocr_free(alloc);
ggml_free(ctx);
return true;
}
static void export_lora(struct export_lora_params * params) {
// load all loras
std::vector<struct lora_data *> loras;
for (size_t i = 0; i < params->lora.size(); ++i) {
struct lora_data * lora = load_lora(&params->lora[i]);
if (lora != NULL) {
loras.push_back(lora);
}
}
if (loras.size() == 0) {
fprintf(stderr, "warning: no lora adapters will be applied.\n");
}
// open input file
struct llama_file fin(params->fn_model_base.c_str(), "rb");
if (!fin.fp) {
die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
}
// open base model gguf, read tensors without their data
struct ggml_context * ctx_in;
struct gguf_init_params params_gguf;
params_gguf.no_alloc = true;
params_gguf.ctx = &ctx_in;
struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
// create new gguf
struct gguf_context * gguf_out = gguf_init_empty();
// copy meta data from base model: kv and tensors
gguf_set_kv(gguf_out, gguf_in);
int n_tensors = gguf_get_n_tensors(gguf_in);
for (int i=0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(gguf_in, i);
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
gguf_add_tensor(gguf_out, tensor);
}
// create output file
struct llama_file fout(params->fn_model_out.c_str(), "wb");
if (!fout.fp) {
die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
}
// write gguf meta data
std::vector<uint8_t> meta;
meta.resize(gguf_get_meta_size(gguf_out));
gguf_get_meta_data(gguf_out, meta.data());
fout.write_raw(meta.data(), meta.size());
std::vector<uint8_t> data;
std::vector<uint8_t> padding;
for (int i=0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(gguf_in, i);
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
// read tensor data
data.resize(ggml_nbytes(tensor));
tensor->data = data.data();
size_t offset = gguf_get_tensor_offset(gguf_in, i);
fin.seek(offset + meta.size(), SEEK_SET);
fin.read_raw(data.data(), data.size());
// apply all loras
for (size_t k = 0; k < loras.size(); ++k) {
apply_lora(tensor, loras[k], params->n_threads);
}
// write tensor data + padding
padding.clear();
padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
GGML_ASSERT(fout.tell() == offset + meta.size());
// fout.seek(offset + meta.size(), SEEK_SET);
fout.write_raw(data.data(), data.size());
fout.write_raw(padding.data(), padding.size());
if (i % 2 == 0) {
printf(".");
}
}
printf("\nexample usage:\n");
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
printf("\nNOTE: output model is F16\n");
printf("\n");
// close gguf
gguf_free(gguf_out);
gguf_free(gguf_in);
// free loras
for (size_t i = 0; i < loras.size(); ++i) {
free_lora(loras[i]);
}
}
int main(int argc, char ** argv) {
struct export_lora_params params = get_default_export_lora_params();
gpt_params params;
if (!export_lora_params_parse(argc, argv, &params)) {
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
export_lora(&params);
g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
exit(EXIT_FAILURE);
}
printf("done, output file is %s\n", params.lora_outfile.c_str());
return 0;
}

View File

@ -1,5 +0,0 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -1,90 +0,0 @@
# finetune
Basic usage instructions:
```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# finetune LORA adapter
./bin/llama-finetune \
--model-base open-llama-3b-v2-q8_0.gguf \
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
--lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
--train-data "shakespeare.txt" \
--save-every 10 \
--threads 6 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing
# predict
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
So in above example after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
The default LORA rank can be specified with `--lora-r N`.
The LORA rank can be configured for each model tensor type separately with these command line options:
```bash
--lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
--rank-att-norm N LORA rank for attention norm tensor (default 1)
--rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1)
--rank-out-norm N LORA rank for output norm tensor (default 1)
--rank-tok-embd N LORA rank for token embeddings tensor (default 4)
--rank-out N LORA rank for output tensor (default 4)
--rank-wq N LORA rank for wq tensor (default 4)
--rank-wk N LORA rank for wk tensor (default 4)
--rank-wv N LORA rank for wv tensor (default 4)
--rank-wo N LORA rank for wo tensor (default 4)
--rank-ffn_gate N LORA rank for ffn_gate tensor (default 4)
--rank-ffn_down N LORA rank for ffn_down tensor (default 4)
--rank-ffn_up N LORA rank for ffn_up tensor (default 4)
```
The LORA rank of 'norm' tensors should always be 1.
To see all available options use `finetune --help`.

View File

@ -1,487 +0,0 @@
#!/usr/bin/env python3
# finetune checkpoint --> gguf conversion
import argparse
import gguf
import struct
import numpy as np
from pathlib import Path
# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
class Tensor:
def __init__(self, dtype='f', ne=None):
if ne is None:
ne = []
self.dtype = dtype
self.ne = ne
self.nbytes = 0
if self.dtype == 'f':
if len(self.ne) == 0:
self.nbytes = 0
else:
self.nbytes = int(np.product(self.ne)) * 4
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
def load(self, data, offset):
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
assert(nd == len(self.ne))
ne = []
for d in range(nd):
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
ne.append(n)
if tuple(ne) != tuple(self.ne):
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
if self.dtype == 'f':
assert(dtype == 0)
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
self.name = bytes(data[offset:offset+namelen]); offset += namelen
# 32-byte alignment
offset += (0 - offset) & 31
self.data = data[offset:offset+self.nbytes]
offset += self.nbytes
return offset
def max_storage_size(self):
result = 0
result += 4 # nd
result += 4 # namelen
result += 4 # dtype
result += len(self.ne)*8 # ne
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
result += 31 # 32-byte alignment
result += self.nbytes
return result
def save_gguf(self, gguf_writer, name):
gguf_writer.add_tensor(
name=name,
tensor=self.data,
raw_shape=np.array(list(reversed(self.ne))),
raw_dtype=gguf.GGMLQuantizationType.F32)
class OptimizationContext:
def __init__(self):
pass
def load(self, data, offset):
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
offset += 4
if self.version != 1:
raise ValueError('Invalid version of optimization context in checkpoint file')
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
# forgot to save type in version 1:
# guess self.type from number of remaining bytes
size_type_0 = 12 + sum([t.max_storage_size() for t in
[self.adam_m, self.adam_v]
+([self.adam_pf] if (self.past > 0) else [])])
size_type_1 = 24 + sum([t.max_storage_size() for t in
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
self.lbfgs_lmal, self.lbfgs_lmys,
self.lbfgs_lms, self.lbfgs_lmy]
+([self.lbfgs_pf] if (self.past > 0) else [])])
# due to alignment padding the size might not by exact
# but the difference in size for both types is significant,
# so we can just use whichever is closest
remaining = len(data) - offset
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
self.type = 0
else:
self.type = 1
if self.type == 0:
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = self.adam_pf.load(data,offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError(f"Invalid optimizer type '{self.type}'")
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
if self.type == 0:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
if self.past > 0:
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
elif self.type == 1:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
if self.past > 0:
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
else:
raise ValueError('Unknown optimizer type')
class LoraParams:
def __init__(self):
pass
def load(self, data, offset):
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
class ModelParams:
def __init__(self, n_ff = None):
self.n_ff = n_ff
def load(self, data, offset):
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def get_n_ff(self):
if self.n_ff is None:
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
else:
return self.n_ff
def save_gguf(self, gguf_writer):
# self.n_vocab not saved
gguf_writer.add_embedding_length(self.n_embd)
gguf_writer.add_head_count(self.n_head)
gguf_writer.add_block_count(self.n_layer)
gguf_writer.add_rope_dimension_count(self.n_rot)
gguf_writer.add_feed_forward_length(self.get_n_ff())
def tensor_name(key, bid=None, suffix=".weight"):
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
class Layer:
def __init__(self, params, lora_params, bid):
self.bid = bid
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
def load(self, data, offset):
offset = self.att_norm_a.load(data, offset)
offset = self.att_norm_b.load(data, offset)
offset = self.wq_a.load(data, offset)
offset = self.wq_b.load(data, offset)
offset = self.wk_a.load(data, offset)
offset = self.wk_b.load(data, offset)
offset = self.wv_a.load(data, offset)
offset = self.wv_b.load(data, offset)
offset = self.wo_a.load(data, offset)
offset = self.wo_b.load(data, offset)
offset = self.ffn_norm_a.load(data, offset)
offset = self.ffn_norm_b.load(data, offset)
offset = self.w1_a.load(data, offset)
offset = self.w1_b.load(data, offset)
offset = self.w2_a.load(data, offset)
offset = self.w2_b.load(data, offset)
offset = self.w3_a.load(data, offset)
offset = self.w3_b.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
class LoraModel:
def __init__(self, n_ff = None):
self.params = ModelParams(n_ff = n_ff)
self.lora_params = LoraParams()
self.layers = []
def load(self, data, offset):
offset = self.params.load(data, offset)
offset = self.lora_params.load(data, offset)
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
offset = self.tok_embd_a.load(data, offset)
offset = self.tok_embd_b.load(data, offset)
offset = self.norm_a.load(data, offset)
offset = self.norm_b.load(data, offset)
offset = self.output_a.load(data, offset)
offset = self.output_b.load(data, offset)
self.layers.clear()
for bid in range(self.params.n_layer):
layer = Layer(self.params, self.lora_params, bid)
offset = layer.load(data, offset)
self.layers.append(layer)
return offset
def save_gguf(self, gguf_writer):
self.params.save_gguf(gguf_writer)
self.lora_params.save_gguf(gguf_writer)
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
for layer in self.layers:
layer.save_gguf(gguf_writer)
class LoraCheckpoint:
def __init__(self, n_ff = None):
self.model = LoraModel(n_ff = n_ff)
self.opt_ctx = OptimizationContext()
def load(self, data, offset):
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
if magic != b'ggcl':
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
if self.version != 0:
raise ValueError('Invalid version of checkpoint file')
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
offset = self.model.load(data, offset)
offset = self.opt_ctx.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
gguf_writer.add_layer_norm_rms_eps(1e-5)
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
self.model.save_gguf(gguf_writer)
self.opt_ctx.save_gguf(gguf_writer)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
return parser.parse_args()
def main():
cfg = handle_args()
print(cfg)
data = np.memmap(cfg.input, mode = 'r')
chk = LoraCheckpoint(n_ff = cfg.ff)
offset = 0
offset = chk.load(data, offset)
# we should have read all available data
assert(offset == len(data))
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
chk.save_gguf(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
if __name__ == '__main__':
main()

File diff suppressed because it is too large Load Diff

View File

@ -1,34 +0,0 @@
#!/bin/bash
cd `dirname $0`
cd ../..
EXE="./llama-finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
while getopts "dg" opt; do
case $opt in
d)
DEBUGGER="gdb --args"
;;
g)
EXE="./build/bin/Release/finetune"
GPUARG="--gpu-layers 25"
;;
esac
done
$DEBUGGER $EXE \
--model-base $MODEL \
$GPUARG \
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
--train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
--save-every 10 \
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing

View File

@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
if (cur_stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
grammar->stacks = prev_stacks;
cur_stacks = prev_stacks;
return false;
}
++pos;
}
for (const auto & stack : grammar->stacks) {
for (const auto & stack : cur_stacks) {
if (stack.empty()) {
return true;
}

View File

@ -0,0 +1,15 @@
set(TARGET llama-gguf-hash)
add_executable(${TARGET} gguf-hash.cpp)
install(TARGETS ${TARGET} RUNTIME)
# clibs dependencies
include_directories(deps/)
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
target_link_libraries(${TARGET} PRIVATE xxhash)
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
target_link_libraries(${TARGET} PRIVATE sha1)
add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@ -0,0 +1,206 @@
# llama-gguf-hash
CLI to hash GGUF files to detect difference on a per model and per tensor level.
**Command line options:**
- `--help`: display help message
- `--xxh64`: use xhash 64bit hash mode (default)
- `--sha1`: use sha1
- `--uuid`: use uuid
- `--sha256`: use sha256
- `--all`: use all hash
- `--no-layer`: exclude per layer hash
- `--uuid`: generate UUIDv5 ID
- `-c`, `--check <manifest>`: verify against a manifest
## About
While most POSIX systems already have hash checking programs like sha256sum, it
is designed to check entire files. This is not ideal for our purpose if we want
to check for consistency of the tensor data even if the metadata content of the
gguf KV store has been updated.
This program is designed to hash a gguf tensor payload on a 'per tensor layer'
in addition to a 'entire tensor model' hash. The intent is that the entire
tensor layer can be checked first but if there is any detected inconsistencies,
then the per tensor hash can be used to narrow down the specific tensor layer
that has inconsistencies.
For Maintainers:
- Detection of tensor inconsistency during development and automated tests
- This is served by xxh64 which is fast
- This is also served by having per tensor layer to assist in narrowing down
the location of the faulty tensor layer
- This is also served by sha1 which is much slower but more widely supported
For Model Creators:
- Optional consistent UUID generation based on model tensor content
- This is served by UUIDv5 which is useful for databases keys
- llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
- Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
For Model Users:
- Assurance of tensor layer integrity even if metadata was updated
- This is served by sha256 which is still considered very secure as of 2024
### Design Note
- The default behavior of this program if no arguments is provided is to hash
using xxhash's xxh32 mode because it is very fast and is primarily targeted
towards maintainers who may want to use this in automated tests.
- xxhash support xxh32 and xxh128 for 32bit hash and 128bit hash respectively
however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus
would have a better affinity to calculating hash that is 64bit in size.
## Compile Example
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
make -C build clean
make -C build llama-gguf-hash VERBOSE=1
./build/bin/llama-gguf-hash test.gguf
./build/bin/llama-gguf-hash --xxh64 test.gguf
./build/bin/llama-gguf-hash --sha1 test.gguf
./build/bin/llama-gguf-hash --uuid test.gguf
./build/bin/llama-gguf-hash --sha256 test.gguf
```
## Generation and Verification Example
To generate we may use this command
```bash
./llama-gguf-hash --all test.gguf > test.gguf.manifest
```
Which would generate a manifest that looks like below, which contains multiple hash type and per tensor layer hashes as well
(This excludes UUID as that is an ID not a hash)
```bash
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1
xxh64 a0af5d700049693b test.gguf:tensor_2
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3
xxh64 1257733306b7992d test.gguf:tensor_4
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4
xxh64 d238d16ba4711e58 test.gguf:tensor_5
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6
xxh64 c22021c29854f093 test.gguf:tensor_7
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7
xxh64 936df61f5d64261f test.gguf:tensor_8
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8
xxh64 93fd20c64421c081 test.gguf:tensor_9
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9
xxh64 5a54d3aad816f302 test.gguf
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf
```
We can then use the normal check command which will by default check for the highest security strength hash and verify against that:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or we may explicitly ask for a faster hash like:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or maybe we want to just check that all the hash is valid:
```bash
$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
## Crypto/Hash Libraries Used
These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs)
- https://github.com/Cyan4973/xxHash
- https://github.com/clibs/sha1/
- https://github.com/jb55/sha256.c

View File

@ -0,0 +1,13 @@
{
"name": "rotate-bits",
"version": "0.1.1",
"repo": "jb55/rotate-bits.h",
"description": "rotate bits",
"keywords": ["rotl", "rotr"],
"src": ["rotate-bits.h"],
"license": "Public Domain",
"development": {
"thlorenz/tap.c": "*"
}
}

View File

@ -0,0 +1,46 @@
#ifndef __ROTATE_DEFS_H
#define __ROTATE_DEFS_H
#ifdef _MSC_VER
#include <stdlib.h>
#define ROTL32(v, n) _rotl((v), (n))
#define ROTL64(v, n) _rotl64((v), (n))
#define ROTR32(v, n) _rotr((v), (n))
#define ROTR64(v, n) _rotr64((v), (n))
#else
#include <stdint.h>
#define U8V(v) ((uint8_t)(v) & 0xFFU)
#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
#define ROTL32(v, n) \
(U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
// tests fail if we don't have this cast...
#define ROTL64(v, n) \
(U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
#define ROTR32(v, n) ROTL32(v, 32 - (n))
#define ROTR64(v, n) ROTL64(v, 64 - (n))
#endif
#define ROTL8(v, n) \
(U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
#define ROTL16(v, n) \
(U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
#define ROTR8(v, n) ROTL8(v, 8 - (n))
#define ROTR16(v, n) ROTL16(v, 16 - (n))
#endif

View File

@ -0,0 +1,9 @@
{
"name": "sha1",
"version": "0.0.1",
"repo": "clibs/sha1",
"description": "sha1 hash algorithm",
"keywords": ["sha1", "hash"],
"license": "public domain",
"src": ["sha1.c", "sha1.h"]
}

View File

@ -0,0 +1,295 @@
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
Test Vectors (from FIPS PUB 180-1)
"abc"
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
A million repetitions of "a"
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
*/
/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
/* #define SHA1HANDSOFF * Copies data before messing with it. */
#define SHA1HANDSOFF
#include <stdio.h>
#include <string.h>
/* for uint32_t */
#include <stdint.h>
#include "sha1.h"
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
|(rol(block->l[i],8)&0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#define blk0(i) block->l[i]
#else
#error "Endianness not defined!"
#endif
#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
^block->l[(i+2)&15]^block->l[i&15],1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
/* Hash a single 512-bit block. This is the core of the algorithm. */
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
)
{
uint32_t a, b, c, d, e;
typedef union
{
unsigned char c[64];
uint32_t l[16];
} CHAR64LONG16;
#ifdef SHA1HANDSOFF
CHAR64LONG16 block[1]; /* use array to appear as a pointer */
memcpy(block, buffer, 64);
#else
/* The following had better never be used because it causes the
* pointer-to-const buffer to be cast into a pointer to non-const.
* And the result is written through. I threw a "const" in, hoping
* this will cause a diagnostic.
*/
CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
#endif
/* Copy context->state[] to working vars */
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
/* 4 rounds of 20 operations each. Loop unrolled. */
R0(a, b, c, d, e, 0);
R0(e, a, b, c, d, 1);
R0(d, e, a, b, c, 2);
R0(c, d, e, a, b, 3);
R0(b, c, d, e, a, 4);
R0(a, b, c, d, e, 5);
R0(e, a, b, c, d, 6);
R0(d, e, a, b, c, 7);
R0(c, d, e, a, b, 8);
R0(b, c, d, e, a, 9);
R0(a, b, c, d, e, 10);
R0(e, a, b, c, d, 11);
R0(d, e, a, b, c, 12);
R0(c, d, e, a, b, 13);
R0(b, c, d, e, a, 14);
R0(a, b, c, d, e, 15);
R1(e, a, b, c, d, 16);
R1(d, e, a, b, c, 17);
R1(c, d, e, a, b, 18);
R1(b, c, d, e, a, 19);
R2(a, b, c, d, e, 20);
R2(e, a, b, c, d, 21);
R2(d, e, a, b, c, 22);
R2(c, d, e, a, b, 23);
R2(b, c, d, e, a, 24);
R2(a, b, c, d, e, 25);
R2(e, a, b, c, d, 26);
R2(d, e, a, b, c, 27);
R2(c, d, e, a, b, 28);
R2(b, c, d, e, a, 29);
R2(a, b, c, d, e, 30);
R2(e, a, b, c, d, 31);
R2(d, e, a, b, c, 32);
R2(c, d, e, a, b, 33);
R2(b, c, d, e, a, 34);
R2(a, b, c, d, e, 35);
R2(e, a, b, c, d, 36);
R2(d, e, a, b, c, 37);
R2(c, d, e, a, b, 38);
R2(b, c, d, e, a, 39);
R3(a, b, c, d, e, 40);
R3(e, a, b, c, d, 41);
R3(d, e, a, b, c, 42);
R3(c, d, e, a, b, 43);
R3(b, c, d, e, a, 44);
R3(a, b, c, d, e, 45);
R3(e, a, b, c, d, 46);
R3(d, e, a, b, c, 47);
R3(c, d, e, a, b, 48);
R3(b, c, d, e, a, 49);
R3(a, b, c, d, e, 50);
R3(e, a, b, c, d, 51);
R3(d, e, a, b, c, 52);
R3(c, d, e, a, b, 53);
R3(b, c, d, e, a, 54);
R3(a, b, c, d, e, 55);
R3(e, a, b, c, d, 56);
R3(d, e, a, b, c, 57);
R3(c, d, e, a, b, 58);
R3(b, c, d, e, a, 59);
R4(a, b, c, d, e, 60);
R4(e, a, b, c, d, 61);
R4(d, e, a, b, c, 62);
R4(c, d, e, a, b, 63);
R4(b, c, d, e, a, 64);
R4(a, b, c, d, e, 65);
R4(e, a, b, c, d, 66);
R4(d, e, a, b, c, 67);
R4(c, d, e, a, b, 68);
R4(b, c, d, e, a, 69);
R4(a, b, c, d, e, 70);
R4(e, a, b, c, d, 71);
R4(d, e, a, b, c, 72);
R4(c, d, e, a, b, 73);
R4(b, c, d, e, a, 74);
R4(a, b, c, d, e, 75);
R4(e, a, b, c, d, 76);
R4(d, e, a, b, c, 77);
R4(c, d, e, a, b, 78);
R4(b, c, d, e, a, 79);
/* Add the working vars back into context.state[] */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
/* Wipe variables */
a = b = c = d = e = 0;
#ifdef SHA1HANDSOFF
memset(block, '\0', sizeof(block));
#endif
}
/* SHA1Init - Initialize new context */
void SHA1Init(
SHA1_CTX * context
)
{
/* SHA1 initialization constants */
context->state[0] = 0x67452301;
context->state[1] = 0xEFCDAB89;
context->state[2] = 0x98BADCFE;
context->state[3] = 0x10325476;
context->state[4] = 0xC3D2E1F0;
context->count[0] = context->count[1] = 0;
}
/* Run your data through this. */
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
)
{
uint32_t i;
uint32_t j;
j = context->count[0];
if ((context->count[0] += len << 3) < j)
context->count[1]++;
context->count[1] += (len >> 29);
j = (j >> 3) & 63;
if ((j + len) > 63)
{
memcpy(&context->buffer[j], data, (i = 64 - j));
SHA1Transform(context->state, context->buffer);
for (; i + 63 < len; i += 64)
{
SHA1Transform(context->state, &data[i]);
}
j = 0;
}
else
i = 0;
memcpy(&context->buffer[j], &data[i], len - i);
}
/* Add padding and return the message digest. */
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
)
{
unsigned i;
unsigned char finalcount[8];
unsigned char c;
#if 0 /* untested "improvement" by DHR */
/* Convert context->count to a sequence of bytes
* in finalcount. Second element first, but
* big-endian order within element.
* But we do it all backwards.
*/
unsigned char *fcp = &finalcount[8];
for (i = 0; i < 2; i++)
{
uint32_t t = context->count[i];
int j;
for (j = 0; j < 4; t >>= 8, j++)
*--fcp = (unsigned char) t}
#else
for (i = 0; i < 8; i++)
{
finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */
}
#endif
c = 0200;
SHA1Update(context, &c, 1);
while ((context->count[0] & 504) != 448)
{
c = 0000;
SHA1Update(context, &c, 1);
}
SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
for (i = 0; i < 20; i++)
{
digest[i] = (unsigned char)
((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
}
/* Wipe variables */
memset(context, '\0', sizeof(*context));
memset(&finalcount, '\0', sizeof(finalcount));
}
void SHA1(
char *hash_out,
const char *str,
uint32_t len)
{
SHA1_CTX ctx;
unsigned int ii;
SHA1Init(&ctx);
for (ii=0; ii<len; ii+=1)
SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
SHA1Final((unsigned char *)hash_out, &ctx);
}

View File

@ -0,0 +1,52 @@
#ifndef SHA1_H
#define SHA1_H
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
*/
#include "stdint.h"
#if defined(__cplusplus)
extern "C" {
#endif
typedef struct
{
uint32_t state[5];
uint32_t count[2];
unsigned char buffer[64];
} SHA1_CTX;
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
);
void SHA1Init(
SHA1_CTX * context
);
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
);
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
);
void SHA1(
char *hash_out,
const char *str,
uint32_t len);
#if defined(__cplusplus)
}
#endif
#endif /* SHA1_H */

View File

@ -0,0 +1,15 @@
{
"name": "sha256",
"version": "0.0.2",
"repo": "jb55/sha256.c",
"description": "sha256 in c",
"keywords": ["sha256", "sha2"],
"src": ["sha256.c", "sha256.h"],
"dependencies": {
"jb55/rotate-bits.h": "0.1.1"
},
"development": {
"thlorenz/tap.c": "*"
}
}

View File

@ -0,0 +1,221 @@
/* Crypto/Sha256.c -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "rotate-bits/rotate-bits.h"
#include "sha256.h"
/* define it for speed optimization */
#define _SHA256_UNROLL
#define _SHA256_UNROLL2
void
sha256_init(sha256_t *p)
{
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
p->state[3] = 0xa54ff53a;
p->state[4] = 0x510e527f;
p->state[5] = 0x9b05688c;
p->state[6] = 0x1f83d9ab;
p->state[7] = 0x5be0cd19;
p->count = 0;
}
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25))
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3))
#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10))
#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define a(i) T[(0-(i))&7]
#define b(i) T[(1-(i))&7]
#define c(i) T[(2-(i))&7]
#define d(i) T[(3-(i))&7]
#define e(i) T[(4-(i))&7]
#define f(i) T[(5-(i))&7]
#define g(i) T[(6-(i))&7]
#define h(i) T[(7-(i))&7]
#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
R(a,b,c,d,e,f,g,h, i); \
R(h,a,b,c,d,e,f,g, (i+1)); \
R(g,h,a,b,c,d,e,f, (i+2)); \
R(f,g,h,a,b,c,d,e, (i+3)); \
R(e,f,g,h,a,b,c,d, (i+4)); \
R(d,e,f,g,h,a,b,c, (i+5)); \
R(c,d,e,f,g,h,a,b, (i+6)); \
R(b,c,d,e,f,g,h,a, (i+7))
#else
#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static void
sha256_transform(uint32_t *state, const uint32_t *data)
{
uint32_t W[16] = {0};
unsigned j;
#ifdef _SHA256_UNROLL2
uint32_t a,b,c,d,e,f,g,h;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
#else
uint32_t T[8];
for (j = 0; j < 8; j++)
T[j] = state[j];
#endif
for (j = 0; j < 64; j += 16)
{
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
RX_8(0); RX_8(8);
#else
unsigned i;
for (i = 0; i < 16; i++) { R(i); }
#endif
}
#ifdef _SHA256_UNROLL2
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
#else
for (j = 0; j < 8; j++)
state[j] += T[j];
#endif
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
/* memset(T, 0, sizeof(T)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
static void
sha256_write_byte_block(sha256_t *p)
{
uint32_t data32[16];
unsigned i;
for (i = 0; i < 16; i++)
data32[i] =
((uint32_t)(p->buffer[i * 4 ]) << 24) +
((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
((uint32_t)(p->buffer[i * 4 + 2]) << 8) +
((uint32_t)(p->buffer[i * 4 + 3]));
sha256_transform(p->state, data32);
}
void
sha256_hash(unsigned char *buf, const unsigned char *data, size_t size)
{
sha256_t hash;
sha256_init(&hash);
sha256_update(&hash, data, size);
sha256_final(&hash, buf);
}
void
sha256_update(sha256_t *p, const unsigned char *data, size_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
sha256_write_byte_block(p);
}
}
}
void
sha256_final(sha256_t *p, unsigned char *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
sha256_write_byte_block(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56);
lenInBits <<= 8;
}
sha256_write_byte_block(p);
for (i = 0; i < 8; i++)
{
*digest++ = (unsigned char)(p->state[i] >> 24);
*digest++ = (unsigned char)(p->state[i] >> 16);
*digest++ = (unsigned char)(p->state[i] >> 8);
*digest++ = (unsigned char)(p->state[i]);
}
sha256_init(p);
}

View File

@ -0,0 +1,24 @@
/* Sha256.h -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain */
#ifndef __CRYPTO_SHA256_H
#define __CRYPTO_SHA256_H
#include <stdlib.h>
#include <stdint.h>
#define SHA256_DIGEST_SIZE 32
typedef struct sha256_t
{
uint32_t state[8];
uint64_t count;
unsigned char buffer[64];
} sha256_t;
void sha256_init(sha256_t *p);
void sha256_update(sha256_t *p, const unsigned char *data, size_t size);
void sha256_final(sha256_t *p, unsigned char *digest);
void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size);
#endif

View File

@ -0,0 +1,12 @@
{
"name": "xxhash",
"version": "0.8.2",
"repo": "Cyan4973/xxhash",
"description": "Extremely fast non-cryptographic hash algorithm",
"keywords": ["xxhash", "hashing"],
"license": "BSD-2-Clause",
"src": [
"xxhash.c",
"xxhash.h"
]
}

View File

@ -0,0 +1,42 @@
/*
* xxHash - Extremely Fast Hash algorithm
* Copyright (C) 2012-2023 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/*
* xxhash.c instantiates functions defined in xxhash.h
*/
#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION /* access definitions */
#include "xxhash.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,693 @@
#include "ggml.h"
#include <cstdlib> /* abort() */
#include <cstddef>
#include <cstdio>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <fstream>
#ifdef __cplusplus
extern "C" {
#endif
#include "xxhash/xxhash.h"
#include "sha1/sha1.h"
#include "sha256/sha256.h"
#ifdef __cplusplus
}
#endif
// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
#define HASH_TYPE_SHA256_STR "sha256"
#define HASH_TYPE_SHA1_STR "sha1"
#define HASH_TYPE_XXH64_STR "xxh64"
#define HASH_TYPE_UUID_STR "uuid"
typedef enum {
HASH_EXIT_SUCCESS = 0, // All hash has been generated or validated
HASH_EXIT_FAILURE = 1, // Generic Failure
HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest
HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it
HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format
} hash_exit_code_t;
typedef enum {
HASH_MANIFEST_NOT_FOUND,
HASH_MANIFEST_MISMATCH,
HASH_MANIFEST_OK,
} hash_manifest_result_t;
struct hash_params {
std::string input;
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
bool no_layer = false;
bool manifest_is_usable = false;
std::string manifest_file;
};
struct manifest_check_params {
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
};
static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
switch (value) {
case HASH_MANIFEST_NOT_FOUND: return "Not Found";
case HASH_MANIFEST_MISMATCH: return "Mismatch";
case HASH_MANIFEST_OK: return "Ok";
}
return "?";
}
static char const * hash_exit_code_to_str(hash_exit_code_t value) {
switch (value) {
case HASH_EXIT_SUCCESS: return "Success";
case HASH_EXIT_FAILURE: return "Failure";
case HASH_EXIT_MISMATCH: return "Mismatch";
case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
}
return "?";
}
static void hash_print_usage(const char * executable) {
const hash_params default_params;
printf("\n");
printf("usage: %s [options] GGUF_IN\n", executable);
printf("\n");
printf("Hash a GGUF file");
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --xxh64 use xxh64 hash\n");
printf(" --sha1 use sha1 hash\n");
printf(" --sha256 use sha256 hash\n");
printf(" --all use all hash\n");
printf(" --no-layer exclude per layer hash\n");
printf(" --uuid generate UUIDv5 ID\n");
printf(" -c, --check <manifest> verify against a manifest\n");
printf("\n");
}
static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
std::string arg;
bool invalid_param = false;
const std::string arg_prefix = "--";
int arg_idx = 1;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
arg = argv[arg_idx];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
bool arg_found = false;
if (arg == "-h" || arg == "--help") {
hash_print_usage(argv[0]);
exit(0);
}
if (arg == "--xxh64") {
arg_found = true;
params.xxh64 = true;
}
if (arg == "--sha1") {
arg_found = true;
params.sha1 = true;
}
if (arg == "--uuid") {
arg_found = true;
params.uuid = true;
}
if (arg == "--sha256") {
arg_found = true;
params.sha256 = true;
}
if (arg == "--all") {
arg_found = true;
params.sha256 = true;
params.sha1 = true;
params.xxh64 = true;
}
if (arg == "--no-layer") {
arg_found = true;
params.no_layer = true;
}
if (arg == "-c" || arg == "--check") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
params.manifest_file = argv[arg_idx];
}
if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument:" + arg);
}
if (argc - arg_idx < 1) {
throw std::invalid_argument("error: bad arguments");
}
params.input = argv[arg_idx++];
}
static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
bool result = true;
try {
hash_params_parse_ex(argc, argv, params);
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
hash_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return result;
}
static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
if (manifest_file.empty()) {
return false;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return false;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
// hash_type_str hash_str tensor_name
// e.g. 'xxh64 f66e9cd66a4396a0 test.gguf:tensor_0'
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
if (line_stream >> file_hash_type) {
if (file_hash_type == HASH_TYPE_SHA256_STR) {
manifest_check.sha256 = true;
} else if (file_hash_type == HASH_TYPE_SHA1_STR) {
manifest_check.sha1 = true;
} else if (file_hash_type == HASH_TYPE_XXH64_STR) {
manifest_check.xxh64 = true;
} else if (file_hash_type == HASH_TYPE_UUID_STR) {
manifest_check.uuid = true;
}
}
}
return true;
}
static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
if (manifest_file.empty()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
std::string file_hash;
std::string file_tensor_name;
if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
// Line parsed. Check hash validity
if (file_hash_type != hash_type_str) {
continue;
}
if (file_tensor_name != tensor_name) {
continue;
}
return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
}
}
return HASH_MANIFEST_NOT_FOUND;
}
static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
// Assumes that digest was processed correctly with the expected namespace
for (int i = 0; i < 16; i++) {
uuid[i] = sha1_digest[i];
}
// Set bits corresponding to UUID ver 5
uuid[ 6] &= ~(0xF << 4);
uuid[ 6] |= (5 << 4);
// Set bits corresponding to UUID variant 0b10XX
uuid[ 8] &= ~(0xc << 4);
uuid[ 8] |= (0x8 << 4);
}
static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const std::string & fname = hash_params.input;
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
// xxh64 init
XXH64_state_t* xxh64_model_hash_state = NULL;
if (hash_params.xxh64) {
xxh64_model_hash_state = XXH64_createState();
if (xxh64_model_hash_state==NULL) {
abort();
}
XXH64_hash_t const seed = 0;
if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
abort();
}
}
// sha1 init
SHA1_CTX sha1_model_hash_ctx;
if (hash_params.sha1) {
SHA1Init(&sha1_model_hash_ctx);
}
// sha256 init
sha256_t sha256_model_hash_ctx;
if (hash_params.sha256) {
sha256_init(&sha256_model_hash_ctx);
}
// sha1 for uuid init
SHA1_CTX sha1_for_uuid_ctx;
if (hash_params.uuid) {
unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
SHA1Init(&sha1_for_uuid_ctx);
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
}
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
const int n_tensors = gguf_get_n_tensors(ctx);
bool tensor_layer_in_manifest = false;
bool model_in_manifest = false;
bool tensor_layer_has_mismatch = false;
bool model_has_mismatch = false;
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
const std::string tensor_layer_name = fname + ":" + name;
if (hash_params.xxh64) {
if (!hash_params.no_layer) {
// Per Layer Hash
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
}
if (hash_params.sha1) {
if (!hash_params.no_layer) {
// Per Layer Hash
char result[21]; // sha1 outputs 20 bytes
SHA1( result, (const char *)raw_data, n_bytes);
char hex_result[41] = {0};
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.sha256) {
if (!hash_params.no_layer) {
// Per Layer Hash
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.uuid) {
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
}
}
if (hash_params.xxh64) {
XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha1) {
unsigned char result[21];
SHA1Final(result, &sha1_model_hash_ctx);
char hex_result[41];
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha256) {
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_final( &sha256_model_hash_ctx, result);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
}
}
if (hash_params.uuid) {
unsigned char result[21];
SHA1Final(result, &sha1_for_uuid_ctx);
unsigned char uuid[16];
generate_uuidv5(result, uuid);
char string_buffer[37] = {0};
snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5], uuid[6], uuid[7],
uuid[8], uuid[9], uuid[10], uuid[11],
uuid[12], uuid[13], uuid[14], uuid[15]);
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, string_buffer, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
}
}
ggml_free(ctx_data);
gguf_free(ctx);
if (hash_params.manifest_is_usable) {
// In hash verification mode
if (!model_in_manifest) {
// model missing in manifest?
// Check tensor layer...
if (!tensor_layer_in_manifest) {
// Still missing? Maybe we are reading the wrong manifest.
return HASH_EXIT_MANIFEST_MISSING_ENTRY;
}
if (tensor_layer_has_mismatch) {
// Per tensor check found error
return HASH_EXIT_FAILURE;
}
// All per tensor layer checks passed? Sounds good enough.
return HASH_EXIT_SUCCESS;
}
// Overall model check passed, but let's check per layer just in case
// If missing, we don't care too much as the overall model checked
if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
return HASH_EXIT_FAILURE;
}
if (model_has_mismatch) {
// model has failed hash somewhere in the model
return HASH_EXIT_FAILURE;
}
// All checks appears to be fine
return HASH_EXIT_SUCCESS;
}
// In hash generation mode
return HASH_EXIT_SUCCESS;
}
int main(int argc, const char ** argv) {
hash_params params;
manifest_check_params manifest_check;
hash_params_parse(argc, argv, params);
if (!params.manifest_file.empty()) {
if (!manifest_type(params.manifest_file, manifest_check)) {
printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_FILE_ERROR;
}
if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
}
printf("manifest %s", params.manifest_file.c_str());
if (manifest_check.sha256) {
printf(" sha256");
}
if (manifest_check.sha1) {
printf(" sha1");
}
if (manifest_check.xxh64) {
printf(" xxh64");
}
if (manifest_check.uuid) {
printf(" uuid");
}
printf("\n");
// Autoselect the highest security hash if manifest is provided but
// the user has not specifically defined the hash they care about
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
// User has not selected a specific value, pick most secure hash
if (manifest_check.sha256) {
params.sha256 = true;
} else if (manifest_check.sha1) {
params.sha1 = true;
} else if (manifest_check.xxh64) {
params.xxh64 = true;
} else if (manifest_check.uuid) {
params.uuid = true;
}
}
params.manifest_is_usable = true;
}
// By default if no swich argument provided, assume xxh64
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
params.xxh64 = true;
}
hash_exit_code_t exit_code = gguf_hash(params);
if (params.manifest_is_usable) {
printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
}
return exit_code;
}

View File

@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
if (!ctx) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
return false;
}
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

View File

@ -1,6 +1,6 @@
# llama.cpp/examples/imatrix
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
## Usage

View File

@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ASSERT(false);
exit(1); //GGML_ABORT("fatal error");
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
exit(1); //GGML_ABORT("fatal error");
}
++e.ncall;
if (m_params.verbosity > 1) {

View File

@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
## Input Prompts

View File

@ -204,25 +204,23 @@ int main(int argc, char ** argv) {
GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
const int space_token = 29871;
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
GGML_ASSERT(llama_token_prefix(model) >= 0);
GGML_ASSERT(llama_token_suffix(model) >= 0);
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_token_middle(model);
if (middle_token >= 0) {
@ -514,26 +512,21 @@ int main(int argc, char ** argv) {
string_process_escapes(params.input_prefix);
string_process_escapes(params.input_suffix);
}
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
// tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
const llama_token middle_token = llama_token_middle(model);
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
@ -657,4 +650,3 @@ int main(int argc, char ** argv) {
return 0;
}

View File

@ -1,9 +1,9 @@
# Usage:
#! ./llama-server -m some-model.gguf &
#! pip install pydantic
#! python json-schema-pydantic-example.py
#! python json_schema_pydantic_example.py
from pydantic import BaseModel, Extra, TypeAdapter
from pydantic import BaseModel, Field, TypeAdapter
from annotated_types import MinLen
from typing import Annotated, List, Optional
import json, requests
@ -17,6 +17,9 @@ if True:
The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
'''
response_format = None
type_adapter = None
if response_model:
type_adapter = TypeAdapter(response_model)
schema = type_adapter.json_schema()

View File

@ -1,4 +1,6 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import itertools
import json
@ -188,7 +190,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
raise RuntimeError("At least one of min_value or max_value must be set")
class BuiltinRule:
def __init__(self, content: str, deps: list = None):
def __init__(self, content: str, deps: list | None = None):
self.content = content
self.deps = deps or []
@ -231,7 +233,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
class SchemaConverter:
@ -248,7 +250,7 @@ class SchemaConverter:
def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
)
return f'"{escaped}"'
@ -403,11 +405,11 @@ class SchemaConverter:
i = 0
length = len(pattern)
def to_rule(s: Tuple[str, bool]) -> str:
def to_rule(s: tuple[str, bool]) -> str:
(txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt
def transform() -> Tuple[str, bool]:
def transform() -> tuple[str, bool]:
'''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
'''
@ -420,7 +422,7 @@ class SchemaConverter:
# We only need a flat structure here to apply repetition operators to the last item, and
# to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
# (GBNF's syntax is luckily very close to regular expressions!)
seq: list[Tuple[str, bool]] = []
seq: list[tuple[str, bool]] = []
def get_dot():
if self._dotall:
@ -602,7 +604,7 @@ class SchemaConverter:
else:
add_component(t, is_required=True)
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
items = schema.get('items') or schema['prefixItems']
@ -691,7 +693,7 @@ class SchemaConverter:
required_props = [k for k in sorted_props if k in required]
optional_props = [k for k in sorted_props if k not in required]
if additional_properties != False:
if additional_properties is not None and additional_properties != False:
sub_name = f'{name}{"-" if name else ""}additional'
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
self._add_primitive('value', PRIMITIVE_RULES['value'])

View File

@ -23,6 +23,10 @@
#include "ggml-cuda.h"
#include "ggml-sycl.h"
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
// utils
static uint64_t get_time_ns() {
using clock = std::chrono::high_resolution_clock;
@ -120,6 +124,17 @@ static std::string get_gpu_info() {
id += "/";
}
}
#endif
#ifdef GGML_USE_CANN
uint32_t count = ggml_backend_cann_get_device_count();
for (uint32_t i = 0; i < count; i++) {
char buf[128];
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
// TODO: other backends
return id;
@ -135,7 +150,7 @@ static const char * output_format_str(output_formats format) {
case JSON: return "json";
case MARKDOWN: return "md";
case SQL: return "sql";
default: GGML_ASSERT(!"invalid output format");
default: GGML_ABORT("invalid output format");
}
}
@ -161,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
case LLAMA_SPLIT_MODE_LAYER: return "layer";
case LLAMA_SPLIT_MODE_ROW: return "row";
default: GGML_ASSERT(!"invalid split mode");
default: GGML_ABORT("invalid split mode");
}
}
@ -1311,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
case SQL:
return std::unique_ptr<printer>(new sql_printer());
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
int main(int argc, char ** argv) {

View File

@ -1,55 +0,0 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
## Fetch latest llama.cpp from GitHub
#include(FetchContent)
#FetchContent_Declare(
# llama
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
# GIT_TAG master
#)
#
## Also provides "common"
#FetchContent_MakeAvailable(llama)
# llama.cpp CI uses the code from the current branch
# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
add_subdirectory(../../../../../../ build-llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)

View File

@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
# build script scope).
project("llama-android")
include(FetchContent)
FetchContent_Declare(
llama
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
GIT_TAG master
)
#include(FetchContent)
#FetchContent_Declare(
# llama
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
# GIT_TAG master
#)
# Also provides "common"
FetchContent_MakeAvailable(llama)
#FetchContent_MakeAvailable(llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
#load local llama.cpp
add_subdirectory(../../../../../../ build-llama)
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be

View File

@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return env->NewStringUTF("");
return nullptr;
}
auto new_token_chars = llama_token_to_piece(context, new_token_id);

View File

@ -26,11 +26,12 @@ actor LlamaContext {
private var context: OpaquePointer
private var batch: llama_batch
private var tokens_list: [llama_token]
var is_done: Bool = false
/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 64
var n_len: Int32 = 1024
var n_cur: Int32 = 0
var n_decode: Int32 = 0
@ -160,6 +161,7 @@ actor LlamaContext {
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
return new_token_str
@ -322,7 +324,7 @@ actor LlamaContext {
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(model, token, result, 8, false)
let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@ -330,7 +332,7 @@ actor LlamaContext {
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {

View File

@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
messageLog += "\(text)"
Task.detached {
while await llamaContext.n_cur < llamaContext.n_len {
while await !llamaContext.is_done {
let result = await llamaContext.completion_loop()
await MainActor.run {
self.messageLog += "\(result)"

View File

@ -30,16 +30,16 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
```
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf \
python ./examples/llava/convert_image_encoder_to_gguf \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
@ -47,17 +47,17 @@ python ./examples/llava/convert-image-encoder-to-gguf \
```
```sh
python ./examples/llava/convert-image-encoder-to-gguf \
python ./examples/llava/convert_image_encoder_to_gguf \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \
--projector-type ldpv2
```
4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`

View File

@ -38,22 +38,22 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
pip install -r examples/llava/requirements.txt
```
3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
```
4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@ -70,9 +70,9 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
pip install -r examples/llava/requirements.txt
```
3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
```console
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
@ -86,13 +86,13 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso
5) Create the visual gguf model:
```console
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
6) Then convert the model to gguf format:
```console
python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the llava cli using the 1.6 model version:

View File

@ -16,6 +16,10 @@
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@ -865,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = peg_0;
}
else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
#endif
#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0);
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();

View File

@ -185,6 +185,8 @@ else:
fout.add_description("two-tower CLIP model")
if has_text_encoder:
assert t_hparams is not None
assert tokens is not None
# text_model hparams
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
@ -259,8 +261,8 @@ if has_vision_encoder:
if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue]
else:
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
image_std = args.image_std if args.image_std is not None else default_image_std
@ -272,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
if has_llava_projector:
model.vision_model.encoder.layers.pop(-1)
model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue]
projector = torch.load(args.llava_projector)
for name, data in projector.items():
name = get_tensor_name(name)
@ -286,7 +288,7 @@ if has_llava_projector:
print("Projector tensors added\n")
state_dict = model.state_dict()
state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue]
for name, data in state_dict.items():
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
# we don't need this

View File

@ -2,7 +2,9 @@ import argparse
import glob
import os
import torch
from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
from safetensors import safe_open
from safetensors.torch import save_file
from typing import Any, ContextManager, cast
# Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path):
@ -13,7 +15,7 @@ def is_safetensor_file(file_path):
def load_model(file_path):
if is_safetensor_file(file_path):
tensors = {}
with safe_open(file_path, framework="pt", device="cpu") as f:
with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
for key in f.keys():
tensors[key] = f.get_tensor(key).clone()
# output shape
@ -134,7 +136,7 @@ if len(mm_tensors) == 0:
if last_checkpoint is not None:
for k, v in last_checkpoint.items():
print(k)
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
print("No tensors found. Is this a LLaVA model?")
exit()
@ -143,8 +145,10 @@ print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
# projector = {name: checkpoint.[name].float() for name in mm_tensors}
projector = {}
for name in mm_tensors:
assert last_checkpoint is not None
projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors:
assert first_checkpoint is not None
projector[name] = first_checkpoint[name].float()
if len(projector) > 0:

View File

@ -1,3 +1,4 @@
-r ../../requirements/requirements-convert-legacy-llama.txt
-r ../../requirements/requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0
torch~=2.1.1
torch~=2.2.1

Some files were not shown because too many files have changed in this diff Show More