Merge branch 'master' into support_glm_edge_model

This commit is contained in:
piDack 2024-12-19 16:50:23 +08:00 committed by GitHub
commit f91cf62b89
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
212 changed files with 25151 additions and 5841 deletions

View File

@ -3,23 +3,36 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 apt-get install -y build-essential git cmake libcurl4-openssl-dev
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app WORKDIR /app
COPY . . COPY . .
ENV LLAMA_CURL=1 RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN make -j$(nproc) WORKDIR /app
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt /app/requirements.txt
COPY requirements /app/requirements
COPY .devops/tools.sh /app/tools.sh
RUN pip install --upgrade pip setuptools wheel && \
pip install -r /app/requirements.txt
COPY --from=build /app/build/bin/ /app/
COPY --from=build /app/lib/ /app/
COPY --from=build /app/convert_hf_to_gguf.py /app/
COPY --from=build /app/gguf-py /app/gguf-py
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8
ENTRYPOINT ["/app/.devops/tools.sh"] ENTRYPOINT ["/app/tools.sh"]

View File

@ -3,21 +3,27 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential git apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app WORKDIR /app
COPY . . COPY . .
RUN make -j$(nproc) llama-cli RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION AS runtime FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \ WORKDIR /app
apt-get install -y libgomp1
COPY --from=build /app/llama-cli /llama-cli RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/bin/llama-cli /app/
COPY --from=build /app/lib/ /app/
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ] ENTRYPOINT [ "/app/llama-cli" ]

View File

@ -9,28 +9,20 @@ WORKDIR /app
COPY . . COPY . .
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
RUN \ cmake --build build -j $(nproc) && \
# Build multiple versions of the CPU backend
scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \
scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \
scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
# Build llama-server
cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build --target llama-server -j $(nproc) && \
# Copy the built libraries to /app/lib
mkdir -p /app/lib && \ mkdir -p /app/lib && \
mv libggml-cpu* /app/lib/ && \
find build -name "*.so" -exec cp {} /app/lib/ \; find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION AS runtime FROM ubuntu:$UBUNTU_VERSION AS runtime
WORKDIR /app
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/bin/llama-server /llama-server COPY --from=build /app/build/bin/llama-server /app/
COPY --from=build /app/lib/ / COPY --from=build /app/lib/ /app/
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine # Must be set to 0.0.0.0 so it can listen to requests from host machine
@ -38,4 +30,4 @@ ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ] ENTRYPOINT [ "/app/llama-server" ]

View File

@ -31,6 +31,7 @@
# Increases the runtime closure size by ~700M # Increases the runtime closure size by ~700M
useMpi ? false, useMpi ? false,
useRocm ? config.rocmSupport, useRocm ? config.rocmSupport,
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true, enableCurl ? true,
useVulkan ? false, useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
] ]
++ optionals useRocm [ ++ optionals useRocm [
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets)) (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
] ]
++ optionals useMetalKit [ ++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")

View File

@ -8,11 +8,11 @@ arg1="$1"
shift shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert_hf_to_gguf.py "$@" exec python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@" exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@" exec ./llama-cli "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..." echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}" echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./llama-quantize "$i" "${i/f16/q4_0}" q4_0 exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
fi fi
done done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./llama-server "$@" exec ./llama-server "$@"
else else
echo "Unknown command: $arg1" echo "Unknown command: $arg1"
echo "Available commands: " echo "Available commands: "

View File

@ -317,7 +317,7 @@ jobs:
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y sudo apt-get update -y
sudo apt-get install -y build-essential vulkan-sdk sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
- name: Build - name: Build
id: cmake_build id: cmake_build
@ -327,6 +327,12 @@ jobs:
cmake -DGGML_VULKAN=ON .. cmake -DGGML_VULKAN=ON ..
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-22-cmake-hip: ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.0.2 container: rocm/dev-ubuntu-22.04:6.0.2
@ -552,35 +558,44 @@ jobs:
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
# TODO: tmp disabled. see for possible re-enable: macOS-latest-swift:
# https://github.com/ggerganov/llama.cpp/pull/10525 runs-on: macos-latest
# macOS-latest-swift:
# runs-on: macos-latest strategy:
# matrix:
# strategy: destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
# matrix:
# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] steps:
# - name: Clone
# steps: id: checkout
# - name: Clone uses: actions/checkout@v4
# id: checkout
# uses: actions/checkout@v4 - name: Dependencies
# id: depends
# - name: Dependencies continue-on-error: true
# id: depends run: |
# continue-on-error: true brew update
# run: |
# brew update - name: Build llama.cpp with CMake
# id: cmake_build
# - name: xcodebuild for swift package run: |
# id: xcodebuild sysctl -a
# run: | mkdir build
# xcodebuild -scheme llama -destination "${{ matrix.destination }}" cd build
# cmake -G Xcode .. \
# - name: Build Swift Example -DGGML_METAL_USE_BF16=ON \
# id: make_build_swift_example -DGGML_METAL_EMBED_LIBRARY=ON \
# run: | -DLLAMA_BUILD_EXAMPLES=OFF \
# make swift -DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
sudo cmake --install . --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
windows-msys2: windows-msys2:
runs-on: windows-latest runs-on: windows-latest
@ -653,6 +668,8 @@ jobs:
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64' - build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
steps: steps:
- name: Clone - name: Clone
@ -694,6 +711,28 @@ jobs:
run: | run: |
choco install ninja choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
mkdir build && cd build
cmake .. `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build-arm64-release && cd build-arm64-release
cmake .. `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install --config release
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
@ -723,7 +762,7 @@ jobs:
- name: Test - name: Test
id: cmake_test id: cmake_test
# not all machines have native AVX-512 # not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: | run: |
cd build cd build
ctest -L main -C Release --verbose --timeout 900 ctest -L main -C Release --verbose --timeout 900
@ -1104,6 +1143,29 @@ jobs:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
sudo cmake --install . --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
- name: Build Xcode project - name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@ -1131,23 +1193,6 @@ jobs:
./gradlew build --no-daemon ./gradlew build --no-daemon
# freeBSD-latest:
# runs-on: macos-12
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Build
# uses: cross-platform-actions/action@v0.19.0
# with:
# operating_system: freebsd
# version: '13.2'
# hypervisor: 'qemu'
# run: |
# sudo pkg update
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release: release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

View File

@ -79,7 +79,7 @@ jobs:
# Setup nodejs (to be used for verifying bundled index.html) # Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4 - uses: actions/setup-node@v4
with: with:
node-version: 22 node-version: '22.11.0'
- name: Verify bundled index.html - name: Verify bundled index.html
id: verify_server_index_html id: verify_server_index_html

View File

@ -46,11 +46,9 @@ if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
endif() endif()
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>") add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>") add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
endif() endif()
# #

View File

@ -31,6 +31,13 @@
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
{
"name": "x64-windows-llvm", "hidden": true,
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
}
},
{ {
"name": "arm64-windows-msvc", "hidden": true, "name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" }, "architecture": { "value": "arm64", "strategy": "external" },
@ -70,6 +77,11 @@
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
{ "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] }, { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

View File

@ -1,3 +1,5 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
ci/ @ggerganov /ci/ @ggerganov
/.devops/ @ngxson
/examples/server/ @ngxson

View File

@ -22,6 +22,7 @@ BUILD_TARGETS = \
llama-infill \ llama-infill \
llama-llava-cli \ llama-llava-cli \
llama-minicpmv-cli\ llama-minicpmv-cli\
llama-qwen2vl-cli\
llama-lookahead \ llama-lookahead \
llama-lookup \ llama-lookup \
llama-lookup-create \ llama-lookup-create \
@ -445,6 +446,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
MK_CFLAGS += -march=native -mtune=native MK_CFLAGS += -march=native -mtune=native
HOST_CXXFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native
# Usage AMX build test
#MK_CFLAGS += -march=graniterapids -mtune=graniterapids
#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
# Usage AVX-only # Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx #MK_CFLAGS += -mfma -mf16c -mavx
#MK_CXXFLAGS += -mfma -mf16c -mavx #MK_CXXFLAGS += -mfma -mf16c -mavx
@ -948,7 +953,6 @@ DIR_COMMON = common
OBJ_GGML = \ OBJ_GGML = \
$(DIR_GGML)/src/ggml.o \ $(DIR_GGML)/src/ggml.o \
$(DIR_GGML)/src/ggml-aarch64.o \
$(DIR_GGML)/src/ggml-alloc.o \ $(DIR_GGML)/src/ggml-alloc.o \
$(DIR_GGML)/src/ggml-backend.o \ $(DIR_GGML)/src/ggml-backend.o \
$(DIR_GGML)/src/ggml-backend-reg.o \ $(DIR_GGML)/src/ggml-backend-reg.o \
@ -956,9 +960,11 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-quants.o \ $(DIR_GGML)/src/ggml-quants.o \
$(DIR_GGML)/src/ggml-threading.o \ $(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
$(OBJ_GGML_EXT) $(OBJ_GGML_EXT)
OBJ_LLAMA = \ OBJ_LLAMA = \
@ -1098,17 +1104,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
# Default target # Default target
all: $(BUILD_TARGETS) all: $(BUILD_TARGETS)
# force c++ build for source file that have same name as c file
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml $(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
# Rules for building object files # Rules for building object files
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
@ -1406,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift swift: examples/batched.swift
(cd examples/batched.swift; make build) (cd examples/batched.swift; make build)

View File

@ -2,59 +2,6 @@
import PackageDescription import PackageDescription
var sources = [
"src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-backend-reg.cpp",
"ggml/src/ggml-cpu/ggml-cpu.c",
"ggml/src/ggml-cpu/ggml-cpu.cpp",
"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-quants.c",
]
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
.headerSearchPath("ggml/src"),
.headerSearchPath("ggml/src/ggml-cpu"),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
.define("GGML_USE_CPU"),
]
#if canImport(Darwin)
sources.append("ggml/src/ggml-common.h")
sources.append("ggml/src/ggml-metal/ggml-metal.m")
resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL"),
]
)
#endif
#if os(Linux)
cSettings.append(.define("_GNU_SOURCE"))
#endif
let package = Package( let package = Package(
name: "llama", name: "llama",
platforms: [ platforms: [
@ -67,26 +14,6 @@ let package = Package(
.library(name: "llama", targets: ["llama"]), .library(name: "llama", targets: ["llama"]),
], ],
targets: [ targets: [
.target( .systemLibrary(name: "llama", pkgConfig: "llama"),
name: "llama", ]
path: ".",
exclude: [
"build",
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"Makefile",
"ggml/src/ggml-metal-embed.metal"
],
sources: sources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: cSettings,
linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx17
) )

View File

@ -98,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
#### Multimodal #### Multimodal
@ -111,6 +112,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge) - [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
</details> </details>
@ -220,7 +222,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU | | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](docs/build.md#hipblas) | AMD GPU | | [HIP](docs/build.md#hip) | AMD GPU |
| [Vulkan](docs/build.md#vulkan) | GPU | | [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU | | [CANN](docs/build.md#cann) | Ascend NPU |
@ -413,7 +415,7 @@ To learn more about model quantization, [read this documentation](examples/quant
[^1]: [examples/perplexity/README.md](examples/perplexity/README.md) [^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](example/bench) ## [`llama-bench`](examples/llama-bench)
#### Benchmark the performance of the inference for various parameters. #### Benchmark the performance of the inference for various parameters.
@ -434,6 +436,20 @@ To learn more about model quantization, [read this documentation](examples/quant
</details> </details>
## [`llama-run`](examples/run)
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
- <details>
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
```bash
llama-run granite-code
```
</details>
[^3]: [RamaLama](https://github.com/containers/ramalama)
## [`llama-simple`](examples/simple) ## [`llama-simple`](examples/simple)

4
Sources/llama/llama.h Normal file
View File

@ -0,0 +1,4 @@
#pragma once
#include <llama.h>

View File

@ -0,0 +1,5 @@
module llama [system] {
header "llama.h"
link "llama"
export *
}

View File

@ -6,5 +6,5 @@ includedir=${prefix}/include
Name: llama Name: llama
Description: Port of Facebook's LLaMA model in C/C++ Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@ Version: @PROJECT_VERSION@
Libs: -L${libdir} -lllama Libs: -L${libdir} -lggml -lggml-base -lllama
Cflags: -I${includedir} Cflags: -I${includedir}

View File

@ -0,0 +1,11 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR x86_64 )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( arch_c_flags "-march=native" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )

View File

@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url # Use curl to download model url
if (LLAMA_CURL) if (LLAMA_CURL)
find_package(CURL REQUIRED) find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL) target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS}) include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED) find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})

View File

@ -119,32 +119,65 @@ std::string common_arg::to_string() {
// utils // utils
// //
static void common_params_handle_model_default(common_params & params) { static void common_params_handle_model_default(
if (!params.hf_repo.empty()) { std::string & model,
std::string & model_url,
std::string & hf_repo,
std::string & hf_file) {
if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model // short-hand to avoid specifying --hf-file -> default it to --model
if (params.hf_file.empty()) { if (hf_file.empty()) {
if (params.model.empty()) { if (model.empty()) {
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
} }
params.hf_file = params.model; hf_file = model;
} else if (params.model.empty()) { } else if (model.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs // this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = params.hf_repo + "_" + params.hf_file; std::string filename = hf_repo + "_" + hf_file;
// to make sure we don't have any slashes in the filename // to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_"); string_replace_all(filename, "/", "_");
params.model = fs_get_cache_file(filename); model = fs_get_cache_file(filename);
} }
} else if (!params.model_url.empty()) { } else if (!model_url.empty()) {
if (params.model.empty()) { if (model.empty()) {
auto f = string_split<std::string>(params.model_url, '#').front(); auto f = string_split<std::string>(model_url, '#').front();
f = string_split<std::string>(f, '?').front(); f = string_split<std::string>(f, '?').front();
params.model = fs_get_cache_file(string_split<std::string>(f, '/').back()); model = fs_get_cache_file(string_split<std::string>(f, '/').back());
} }
} else if (params.model.empty()) { } else if (model.empty()) {
params.model = DEFAULT_MODEL_PATH; model = DEFAULT_MODEL_PATH;
} }
} }
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
for (const auto & type : kv_cache_types) {
if (ggml_type_name(type) == s) {
return type;
}
}
throw std::runtime_error("Unsupported cache type: " + s);
}
static std::string get_all_kv_cache_types() {
std::ostringstream msg;
for (const auto & type : kv_cache_types) {
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
}
return msg.str();
}
// //
// CLI argument parsing functions // CLI argument parsing functions
// //
@ -247,7 +280,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
} }
common_params_handle_model_default(params); // TODO: refactor model params in a common struct
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
if (params.escape) { if (params.escape) {
string_process_escapes(params.prompt); string_process_escapes(params.prompt);
@ -591,7 +626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) { [](common_params & params) {
params.ctx_shift = false; params.ctx_shift = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg( add_opt(common_arg(
{"--chunks"}, "N", {"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@ -786,7 +821,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) { [](common_params & params) {
params.warmup = false; params.warmup = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"--spm-infill"}, {"--spm-infill"},
string_format( string_format(
@ -813,7 +848,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--sampling-seq"}, "SEQUENCE", {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sampling.samplers = common_sampler_types_from_chars(value); params.sampling.samplers = common_sampler_types_from_chars(value);
@ -826,13 +861,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.ignore_eos = true; params.sampling.ignore_eos = true;
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg(
{"--penalize-nl"},
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
[](common_params & params) {
params.sampling.penalize_nl = true;
}
).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--temp"}, "N", {"--temp"}, "N",
string_format("temperature (default: %.1f)", (double)params.sampling.temp), string_format("temperature (default: %.1f)", (double)params.sampling.temp),
@ -887,6 +915,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--repeat-last-n"}, "N", {"--repeat-last-n"}, "N",
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
[](common_params & params, int value) { [](common_params & params, int value) {
if (value < -1) {
throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
}
params.sampling.penalty_last_n = value; params.sampling.penalty_last_n = value;
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
} }
@ -941,6 +972,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--dry-penalty-last-n"}, "N", {"--dry-penalty-last-n"}, "N",
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
[](common_params & params, int value) { [](common_params & params, int value) {
if (value < -1) {
throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
}
params.sampling.dry_penalty_last_n = value; params.sampling.dry_penalty_last_n = value;
} }
).set_sparam()); ).set_sparam());
@ -1174,18 +1208,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE", {"-ctk", "--cache-type-k"}, "TYPE",
string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), string_format(
"KV cache data type for K\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.cache_type_k)
),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
// TODO: get the type right here params.cache_type_k = kv_cache_type_from_str(value);
params.cache_type_k = value;
} }
).set_env("LLAMA_ARG_CACHE_TYPE_K")); ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
add_opt(common_arg( add_opt(common_arg(
{"-ctv", "--cache-type-v"}, "TYPE", {"-ctv", "--cache-type-v"}, "TYPE",
string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), string_format(
"KV cache data type for V\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.cache_type_v)
),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
// TODO: get the type right here params.cache_type_v = kv_cache_type_from_str(value);
params.cache_type_v = value;
} }
).set_env("LLAMA_ARG_CACHE_TYPE_V")); ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
add_opt(common_arg( add_opt(common_arg(
@ -1543,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.hf_file = value; params.hf_file = value;
} }
).set_env("LLAMA_ARG_HF_FILE")); ).set_env("LLAMA_ARG_HF_FILE"));
add_opt(common_arg(
{"-hfrv", "--hf-repo-v"}, "REPO",
"Hugging Face model repository for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO_V"));
add_opt(common_arg(
{"-hffv", "--hf-file-v"}, "FILE",
"Hugging Face model file for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.hf_file = value;
}
).set_env("LLAMA_ARG_HF_FILE_V"));
add_opt(common_arg( add_opt(common_arg(
{"-hft", "--hf-token"}, "TOKEN", {"-hft", "--hf-token"}, "TOKEN",
"Hugging Face access token (default: value from HF_TOKEN environment variable)", "Hugging Face access token (default: value from HF_TOKEN environment variable)",
@ -1711,6 +1769,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value; params.public_path = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
[](common_params & params) {
params.webui = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
add_opt(common_arg( add_opt(common_arg(
{"--embedding", "--embeddings"}, {"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@ -2076,35 +2141,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_max = value; params.speculative.n_max = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg( add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N", {"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_min = value; params.speculative.n_min = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg( add_opt(common_arg(
{"--draft-p-split"}, "P", {"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.p_split = std::stof(value); params.speculative.p_split = std::stof(value);
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
add_opt(common_arg( add_opt(common_arg(
{"--draft-p-min"}, "P", {"--draft-p-min"}, "P",
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.p_min = std::stof(value); params.speculative.p_min = std::stof(value);
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg( add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N", {"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_ctx = value; params.speculative.n_ctx = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
add_opt(common_arg( add_opt(common_arg(
{"-devd", "--device-draft"}, "<dev1,dev2,..>", {"-devd", "--device-draft"}, "<dev1,dev2,..>",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@ -2124,14 +2189,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
} }
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
add_opt(common_arg( add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME", {"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)", "draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.model = value; params.speculative.model = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
{"-mv", "--model-vocoder"}, "FNAME",
"vocoder model for audio generation (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.model = value;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
return ctx_arg; return ctx_arg;
} }

View File

@ -940,6 +940,25 @@ struct common_init_result common_init_from_params(common_params & params) {
params.sampling.ignore_eos = false; params.sampling.ignore_eos = false;
} }
if (params.sampling.ignore_eos) {
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
if (llama_token_is_eog(model, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
}
}
if (params.sampling.penalty_last_n == -1) {
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.penalty_last_n = llama_n_ctx(lctx);
}
if (params.sampling.dry_penalty_last_n == -1) {
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
}
if (params.warmup) { if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
@ -1015,38 +1034,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
return mparams; return mparams;
} }
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f32") {
return GGML_TYPE_F32;
}
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "bf16") {
return GGML_TYPE_BF16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
throw std::runtime_error("Unsupported cache type: " + s);
}
struct llama_context_params common_context_params_to_llama(const common_params & params) { struct llama_context_params common_context_params_to_llama(const common_params & params) {
auto cparams = llama_context_default_params(); auto cparams = llama_context_default_params();
@ -1081,8 +1068,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
} }
cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_k = params.cache_type_k;
cparams.type_v = kv_cache_type_from_str(params.cache_type_v); cparams.type_v = params.cache_type_v;
return cparams; return cparams;
} }
@ -1108,13 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3 #define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2 #define CURL_RETRY_DELAY_SECONDS 2
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts; int remaining_attempts = max_attempts;
while (remaining_attempts > 0) { while (remaining_attempts > 0) {
@ -1138,7 +1119,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
} }
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl // Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup); std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
if (!curl) { if (!curl) {
@ -1211,11 +1191,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
std::string etag; std::string etag;
std::string last_modified; std::string last_modified;
}; };
common_load_model_from_url_headers headers; common_load_model_from_url_headers headers;
{ {
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata; common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase); static std::regex etag_regex("ETag", std::regex_constants::icase);
@ -1799,7 +1781,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
break; break;
case 0: // max absolute case 0: // max absolute
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
if (sum < std::abs(inp[i])) sum = std::abs(inp[i]); if (sum < std::abs(inp[i])) {
sum = std::abs(inp[i]);
}
} }
sum /= 32760.0; // make an int16 range sum /= 32760.0; // make an int16 range
break; break;

View File

@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT; extern const char * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER; extern const char * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET; extern const char * LLAMA_BUILD_TARGET;
struct common_control_vector_load_info; struct common_control_vector_load_info;
@ -80,6 +80,7 @@ enum llama_example {
LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_COUNT, LLAMA_EXAMPLE_COUNT,
}; };
@ -95,6 +96,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_TEMPERATURE = 7, COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
COMMON_SAMPLER_TYPE_XTC = 8, COMMON_SAMPLER_TYPE_XTC = 8,
COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
}; };
// dimensionality reduction methods, used by cvector-generator // dimensionality reduction methods, used by cvector-generator
@ -130,7 +132,6 @@ struct common_params_sampling {
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false; bool ignore_eos = false;
bool no_perf = false; // disable performance metrics bool no_perf = false; // disable performance metrics
bool timing_per_token = false; bool timing_per_token = false;
@ -139,6 +140,7 @@ struct common_params_sampling {
std::vector<enum common_sampler_type> samplers = { std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY, COMMON_SAMPLER_TYPE_DRY,
COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TYPICAL_P, COMMON_SAMPLER_TYPE_TYPICAL_P,
@ -158,6 +160,7 @@ struct common_params_sampling {
struct common_params_speculative { struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_ctx = 0; // draft context size int32_t n_ctx = 0; // draft context size
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@ -171,6 +174,14 @@ struct common_params_speculative {
std::string model = ""; // draft model for speculative decoding // NOLINT std::string model = ""; // draft model for speculative decoding // NOLINT
}; };
struct common_params_vocoder {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
};
struct common_params { struct common_params {
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size int32_t n_ctx = 4096; // context size
@ -194,9 +205,11 @@ struct common_params {
// offload params // offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
struct cpu_params cpuparams; struct cpu_params cpuparams;
@ -213,9 +226,10 @@ struct common_params {
struct common_params_sampling sampling; struct common_params_sampling sampling;
struct common_params_speculative speculative; struct common_params_speculative speculative;
struct common_params_vocoder vocoder;
std::string model = ""; // model path // NOLINT std::string model = ""; // model path // NOLINT
std::string model_alias = "unknown"; // model alias // NOLINT std::string model_alias = ""; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT std::string hf_repo = ""; // HF repo // NOLINT
@ -286,8 +300,8 @@ struct common_params {
bool warmup = true; // warmup run bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT std::string mmproj = ""; // path to multimodal projector // NOLINT
@ -437,6 +451,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts; return parts;
} }
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);
@ -588,7 +607,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
// Embedding utils // Embedding utils
// //
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); // TODO: repace embd_norm with an enum
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

View File

@ -161,26 +161,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
params.logit_bias.size(), params.logit_bias.size(),
params.logit_bias.data())); params.logit_bias.data()));
llama_sampler_chain_add(result->chain,
llama_sampler_init_penalties(
llama_n_vocab (model),
llama_token_eos(model),
llama_token_nl (model),
params.penalty_last_n,
params.penalty_repeat,
params.penalty_freq,
params.penalty_present,
params.penalize_nl,
params.ignore_eos));
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) { for (const auto & cnstr : params.samplers) {
switch (cnstr) { switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY: case COMMON_SAMPLER_TYPE_DRY:
{ {
std::vector<const char*> c_breakers; std::vector<const char *> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size()); c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto& str : params.dry_sequence_breakers) { for (const auto & str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str()); c_breakers.push_back(str.c_str());
} }
@ -208,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_INFILL: case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
break; break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
default: default:
GGML_ASSERT(false && "unknown sampler type"); GGML_ASSERT(false && "unknown sampler type");
} }
@ -415,6 +406,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i'; case COMMON_SAMPLER_TYPE_INFILL: return 'i';
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
default : return '?'; default : return '?';
} }
} }
@ -429,6 +421,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill"; case COMMON_SAMPLER_TYPE_INFILL: return "infill";
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
default : return ""; default : return "";
} }
} }
@ -443,6 +436,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC }, { "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL }, { "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
}; };
// since samplers names are written multiple ways // since samplers names are written multiple ways
@ -489,6 +483,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
}; };
std::vector<common_sampler_type> samplers; std::vector<common_sampler_type> samplers;

View File

@ -62,6 +62,10 @@ struct common_speculative * common_speculative_init(
} }
void common_speculative_free(struct common_speculative * spec) { void common_speculative_free(struct common_speculative * spec) {
if (spec == nullptr) {
return;
}
common_sampler_free(spec->smpl); common_sampler_free(spec->smpl);
llama_batch_free(spec->batch); llama_batch_free(spec->batch);

View File

@ -221,7 +221,7 @@ class Model:
self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_context_length(n_ctx)
logger.info(f"gguf: context length = {n_ctx}") logger.info(f"gguf: context length = {n_ctx}")
n_embd = self.find_hparam(["hidden_size", "n_embd"]) if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_embedding_length(n_embd)
logger.info(f"gguf: embedding length = {n_embd}") logger.info(f"gguf: embedding length = {n_embd}")
@ -229,7 +229,7 @@ class Model:
self.gguf_writer.add_feed_forward_length(n_ff) self.gguf_writer.add_feed_forward_length(n_ff)
logger.info(f"gguf: feed forward length = {n_ff}") logger.info(f"gguf: feed forward length = {n_ff}")
n_head = self.find_hparam(["num_attention_heads", "n_head"]) if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count(n_head)
logger.info(f"gguf: head count = {n_head}") logger.info(f"gguf: head count = {n_head}")
@ -296,7 +296,9 @@ class Model:
break break
for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
data = data_torch.squeeze().numpy() # TODO: why do we squeeze here?
# data = data_torch.squeeze().numpy()
data = data_torch.numpy()
# if data ends up empty, it means data_torch was a scalar tensor -> restore # if data ends up empty, it means data_torch was a scalar tensor -> restore
if len(data.shape) == 0: if len(data.shape) == 0:
@ -324,6 +326,8 @@ class Model:
gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_W2,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
gguf.MODEL_TENSOR.POSNET_NORM1,
gguf.MODEL_TENSOR.POSNET_NORM2,
) )
) )
or not new_name.endswith(".weight") or not new_name.endswith(".weight")
@ -658,6 +662,15 @@ class Model:
if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
# ref: https://huggingface.co/facebook/chameleon-7b # ref: https://huggingface.co/facebook/chameleon-7b
res = "chameleon" res = "chameleon"
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
res = "minerva-7b"
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
# ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
res = "roberta-bpe"
if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
# ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
res = "gigachat"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
@ -680,6 +693,9 @@ class Model:
return res return res
# Marker: End get_vocab_base_pre # Marker: End get_vocab_base_pre
def _set_vocab_none(self) -> None:
self.gguf_writer.add_tokenizer_model("none")
def _set_vocab_gpt2(self) -> None: def _set_vocab_gpt2(self) -> None:
tokens, toktypes, tokpre = self.get_vocab_base() tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_model("gpt2")
@ -1831,29 +1847,40 @@ class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM model_arch = gguf.MODEL_ARCH.MINICPM
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"] super().set_gguf_parameters()
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) embedding_scale = float(self.hparams["scale_emb"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_embedding_scale(embedding_scale)
self.gguf_writer.add_block_count(block_count) logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_residual_scale(residual_scale)
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_logit_scale(logit_scale)
self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
if self.hparams.get("rope_scaling") is not None:
if self.hparams["rope_scaling"].get("type") == "longrope":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
def set_vocab(self): def set_vocab(self):
self._set_vocab_llama_hf() self._set_vocab_sentencepiece()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused
@ -1863,9 +1890,9 @@ class MiniCPMModel(Model):
# HF models permute some of the tensors, so we need to undo that # HF models permute some of the tensors, so we need to undo that
if name.endswith(("q_proj.weight")): if name.endswith(("q_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight")): if name.endswith(("k_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@ -1975,6 +2002,75 @@ class Qwen2Model(Model):
except FileNotFoundError: except FileNotFoundError:
self._set_vocab_gpt2() self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
@Model.register("Qwen2VLForConditionalGeneration")
class Qwen2VLModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2VL
def set_gguf_parameters(self):
super().set_gguf_parameters()
mrope_section = self.hparams["rope_scaling"]["mrope_section"]
mrope_section += [0] * max(0, 4 - len(mrope_section))
self.gguf_writer.add_rope_dimension_sections(mrope_section)
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
for name, data in super().get_tensors():
if name.startswith("visual."):
continue
yield name, data
@Model.register("WavTokenizerDec")
class WavTokenizerDecModel(Model):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if \
name.endswith("codebook.cluster_size") or \
name.endswith("codebook.embed_avg") or \
name.endswith("codebook.inited"):
logger.debug(f"Skipping {name!r}")
return []
logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
return [(self.map_tensor_name(name), data_torch)]
def set_vocab(self):
self._set_vocab_none()
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size (self.hparams["vocab_size"])
self.gguf_writer.add_features_length (self.hparams["n_embd_features"])
self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"])
self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"])
self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"])
self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"])
self.gguf_writer.add_causal_attention(False)
@Model.register("Qwen2MoeForCausalLM") @Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model): class Qwen2MoeModel(Model):
@ -2519,7 +2615,7 @@ class InternLM2Model(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("BertModel", "CamembertModel") @Model.register("BertModel", "CamembertModel", "RobertaModel")
class BertModel(Model): class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT model_arch = gguf.MODEL_ARCH.BERT
@ -2560,7 +2656,8 @@ class BertModel(Model):
# we need this to validate the size of the token_type embeddings # we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings # though currently we are passing all zeros to the token_type embeddings
self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" # "Sequence A" or "Sequence B"
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
# convert to phantom space vocab # convert to phantom space vocab
def phantom(tok): def phantom(tok):
@ -3378,6 +3475,97 @@ class ArcticModel(Model):
raise ValueError(f"Unprocessed experts: {experts}") raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("DeepseekForCausalLM")
class DeepseekModel(Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
if "head_dim" in hparams:
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_weights_scale(1.0)
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
_experts: list[dict[str, Tensor]] | None = None
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("DeepseekV2ForCausalLM") @Model.register("DeepseekV2ForCausalLM")
class DeepseekV2Model(Model): class DeepseekV2Model(Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2 model_arch = gguf.MODEL_ARCH.DEEPSEEK2

View File

@ -17,7 +17,7 @@
# #
# python3 convert_hf_to_gguf_update.py <huggingface_token> # python3 convert_hf_to_gguf_update.py <huggingface_token>
# #
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py # - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
# - Update llama.cpp with the new pre-tokenizer if necessary # - Update llama.cpp with the new pre-tokenizer if necessary
# #
# TODO: generate tokenizer tests for llama.cpp # TODO: generate tokenizer tests for llama.cpp
@ -102,6 +102,9 @@ models = [
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
] ]

View File

@ -55,7 +55,14 @@ cmake --build build --config Release
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release cmake --build build-arm64-windows-llvm-release
``` ```
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
For building with ninja generator and clang compiler as default:
-set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
```bash
cmake --preset x64-windows-llvm-release
cmake --build build-x64-windows-llvm-release
```
## BLAS Build ## BLAS Build

View File

@ -20,7 +20,12 @@ else()
add_subdirectory(batched) add_subdirectory(batched)
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(eval-callback) add_subdirectory(eval-callback)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(gbnf-validator) add_subdirectory(gbnf-validator)
endif()
add_subdirectory(gguf-hash) add_subdirectory(gguf-hash)
add_subdirectory(gguf-split) add_subdirectory(gguf-split)
add_subdirectory(gguf) add_subdirectory(gguf)
@ -46,12 +51,17 @@ else()
add_subdirectory(speculative) add_subdirectory(speculative)
add_subdirectory(speculative-simple) add_subdirectory(speculative-simple)
add_subdirectory(tokenize) add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
if (NOT GGML_BACKEND_DL) if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading # these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator) add_subdirectory(cvector-generator)
add_subdirectory(export-lora) add_subdirectory(export-lora)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(quantize-stats) add_subdirectory(quantize-stats)
endif()
add_subdirectory(llava) add_subdirectory(llava)
if (GGML_RPC) if (GGML_RPC)
add_subdirectory(rpc) add_subdirectory(rpc)

View File

@ -65,6 +65,7 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
auto sparams = llama_sampler_chain_default_params(); auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams); llama_sampler * smpl = llama_sampler_chain_init(sparams);

View File

@ -12,7 +12,7 @@ int main(int argc, char** argv) {
} }
// Get only the program name from the full path // Get only the program name from the full path
auto pos = filename.find_last_of('/'); auto pos = filename.find_last_of("/\\");
if (pos != std::string::npos) { if (pos != std::string::npos) {
filename = filename.substr(pos+1); filename = filename.substr(pos+1);
} }

View File

@ -287,7 +287,7 @@ struct split_strategy {
} }
void print_info() { void print_info() {
printf("n_split: %ld\n", ctx_outs.size()); printf("n_split: %zu\n", ctx_outs.size());
int i_split = 0; int i_split = 0;
for (auto & ctx_out : ctx_outs) { for (auto & ctx_out : ctx_outs) {
// re-calculate the real gguf size for each split (= metadata size + total size of all tensors) // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@ -297,7 +297,7 @@ struct split_strategy {
total_size += ggml_nbytes(t); total_size += ggml_nbytes(t);
} }
total_size = total_size / 1000 / 1000; // convert to megabytes total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++; i_split++;
} }
} }

View File

@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
} }
std::vector<float> emb_norm(emb_unorm.size()); std::vector<float> emb_norm(emb_unorm.size());
common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);
result.push_back(emb_norm); result.push_back(emb_norm);
#ifdef GRIT_DEBUG #ifdef GRIT_DEBUG

View File

@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
for (const auto & inst : params_instances) { for (const auto & inst : params_instances) {
params_idx++; params_idx++;
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
} }
// keep the same model between tests when possible // keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
// warmup run // warmup run
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
} }
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
} }
if (t.n_gen > 0) { if (t.n_gen > 0) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
} }
test_gen(ctx, 1, t.n_threads); test_gen(ctx, 1, t.n_threads);
} }
@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
i + 1, params.reps); i + 1, params.reps);
} }
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
} }
if (t.n_gen > 0) { if (t.n_gen > 0) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
i + 1, params.reps); i + 1, params.reps);
} }
test_gen(ctx, t.n_gen, t.n_threads); test_gen(ctx, t.n_gen, t.n_threads);

View File

@ -19,6 +19,7 @@ android {
externalNativeBuild { externalNativeBuild {
cmake { cmake {
arguments += "-DLLAMA_BUILD_COMMON=ON" arguments += "-DLLAMA_BUILD_COMMON=ON"
arguments += "-DGGML_LLAMAFILE=OFF"
arguments += "-DCMAKE_BUILD_TYPE=Release" arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf() cppFlags += listOf()
arguments += listOf() arguments += listOf()

View File

@ -210,20 +210,20 @@ actor LlamaContext {
llama_kv_cache_clear(context) llama_kv_cache_clear(context)
let t_pp_start = ggml_time_us() let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
if llama_decode(context, batch) != 0 { if llama_decode(context, batch) != 0 {
print("llama_decode() failed during prompt") print("llama_decode() failed during prompt")
} }
llama_synchronize(context) llama_synchronize(context)
let t_pp_end = ggml_time_us() let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000;
// bench text generation // bench text generation
llama_kv_cache_clear(context) llama_kv_cache_clear(context)
let t_tg_start = ggml_time_us() let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
for i in 0..<tg { for i in 0..<tg {
llama_batch_clear(&batch) llama_batch_clear(&batch)
@ -238,7 +238,7 @@ actor LlamaContext {
llama_synchronize(context) llama_synchronize(context)
} }
let t_tg_end = ggml_time_us() let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
llama_kv_cache_clear(context) llama_kv_cache_clear(context)

View File

@ -7,6 +7,7 @@
objects = { objects = {
/* Begin PBXBuildFile section */ /* Begin PBXBuildFile section */
1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; }; 79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
@ -17,7 +18,6 @@
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; }; F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
@ -42,7 +42,7 @@
isa = PBXFrameworksBuildPhase; isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
DF810E132B4A5BA200301144 /* llama in Frameworks */, 1809696D2D05A39F00400EE8 /* llama in Frameworks */,
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
); );
@ -151,7 +151,7 @@
); );
name = llama.swiftui; name = llama.swiftui;
packageProductDependencies = ( packageProductDependencies = (
DF810E122B4A5BA200301144 /* llama */, 1809696C2D05A39F00400EE8 /* llama */,
); );
productName = llama.swiftui; productName = llama.swiftui;
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@ -429,7 +429,7 @@
/* End XCConfigurationList section */ /* End XCConfigurationList section */
/* Begin XCSwiftPackageProductDependency section */ /* Begin XCSwiftPackageProductDependency section */
DF810E122B4A5BA200301144 /* llama */ = { 1809696C2D05A39F00400EE8 /* llama */ = {
isa = XCSwiftPackageProductDependency; isa = XCSwiftPackageProductDependency;
productName = llama; productName = llama;
}; };

View File

@ -43,3 +43,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-qwen2vl-cli)
add_executable(${TARGET} qwen2vl-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -103,7 +103,9 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector" #define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.%s.embedding_length" #define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length" #define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count" #define KEY_N_BLOCK "clip.%s.block_count"
@ -130,7 +132,8 @@ static std::string format(const char * fmt, ...) {
#define TN_TOKEN_EMBD "%s.token_embd.weight" #define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd" #define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@ -174,6 +177,7 @@ enum projector_type {
PROJECTOR_TYPE_LDPV2, PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_ADAPTER, PROJECTOR_TYPE_ADAPTER,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
@ -183,6 +187,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LDPV2, "ldpv2"}, { PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"}, { PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_ADAPTER, "adapter"} { PROJECTOR_TYPE_ADAPTER, "adapter"}
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
}; };
@ -475,7 +480,8 @@ struct clip_vision_model {
// embeddings // embeddings
struct ggml_tensor * class_embedding; struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings; struct ggml_tensor * patch_embeddings_0;
struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
struct ggml_tensor * patch_bias; struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings; struct ggml_tensor * position_embeddings;
@ -572,6 +578,7 @@ struct clip_ctx {
bool has_llava_projector = false; bool has_llava_projector = false;
bool has_minicpmv_projector = false; bool has_minicpmv_projector = false;
bool has_glm_projector = false; bool has_glm_projector = false;
bool has_qwen2vl_merger = false;
int minicpmv_version = 2; int minicpmv_version = 2;
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
@ -580,6 +587,7 @@ struct clip_ctx {
float image_mean[3]; float image_mean[3];
float image_std[3]; float image_std[3];
bool use_gelu = false; bool use_gelu = false;
bool use_silu = false;
int32_t ftype = 1; int32_t ftype = 1;
bool has_class_embedding = true; bool has_class_embedding = true;
@ -625,14 +633,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
image_size_height = imgs->data->ny; image_size_height = imgs->data->ny;
} }
} }
else if (ctx->has_qwen2vl_merger) {
// use the image's native resolution when image is avaible
if (is_inf) {
// if (imgs->data->nx && imgs->data->ny) {
image_size_width = imgs->data->nx;
image_size_height = imgs->data->ny;
}
}
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
const int hidden_size = hparams.hidden_size; const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head; const int d_head = hidden_size / n_head;
int n_layer = hparams.n_layer; int n_layer = hparams.n_layer;
const float eps = hparams.eps; const float eps = hparams.eps;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
const int batch_size = imgs->size; const int batch_size = imgs->size;
@ -653,10 +673,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_set_name(inp_raw, "inp_raw"); ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw); ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
if (ctx->has_qwen2vl_merger) {
GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, patches_h, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
inp = ggml_reshape_3d(
ctx0, inp,
hidden_size, patches_w * patches_h, batch_size);
}
else {
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
}
if (ctx->has_patch_bias) { if (ctx->has_patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@ -678,12 +718,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
} }
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions"); ggml_set_name(positions, "positions");
ggml_set_input(positions); ggml_set_input(positions);
if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
embeddings = embeddings =
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
}
if (ctx->has_minicpmv_projector) { if (ctx->has_minicpmv_projector) {
int pos_w = image_size_width/patch_size; int pos_w = image_size_width/patch_size;
@ -707,7 +749,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
// loop over layers // loop over layers
if (ctx->has_minicpmv_projector || ctx->has_glm_projector) { if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
n_layer += 1; n_layer += 1;
} }
for (int il = 0; il < n_layer - 1; il++) { for (int il = 0; il < n_layer - 1; il++) {
@ -729,8 +771,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_tensor * Q = struct ggml_tensor * Q =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) {
Q = ggml_rope_multi(
ctx0, Q, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
}
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
@ -738,6 +785,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) {
K = ggml_rope_multi(
ctx0, K, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
}
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
@ -777,6 +829,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (ctx->use_gelu) { if (ctx->use_gelu) {
cur = ggml_gelu_inplace(ctx0, cur); cur = ggml_gelu_inplace(ctx0, cur);
} else if (ctx->use_silu) {
cur = ggml_silu_inplace(ctx0, cur);
} else { } else {
cur = ggml_gelu_quick_inplace(ctx0, cur); cur = ggml_gelu_quick_inplace(ctx0, cur);
} }
@ -788,6 +842,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
cur = ggml_add(ctx0, embeddings, cur); cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur; embeddings = cur;
} }
// post-layernorm // post-layernorm
@ -859,7 +914,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr // stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
// layer norm // layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@ -907,7 +962,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// block_2 // block_2
{ {
// stride = 2 // stride = 2
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm // layer norm
@ -968,7 +1023,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// mlp_2 ne [24, 24, 2048, 1] // mlp_2 ne [24, 24, 2048, 1]
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
// weight ne = [3, 3, 2048, 1] // weight ne = [3, 3, 2048, 1]
struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
@ -1075,6 +1130,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}else{ }else{
GGML_ABORT("fatel error"); GGML_ABORT("fatel error");
} }
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
} }
// build the graph // build the graph
@ -1257,6 +1324,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx); new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
} }
idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
if (idx != -1) {
new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
}
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(new_clip->has_vision_encoder);
@ -1265,6 +1336,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
idx = get_key_idx(ctx, KEY_USE_GELU); idx = get_key_idx(ctx, KEY_USE_GELU);
new_clip->use_gelu = gguf_get_val_bool(ctx, idx); new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
try {
idx = get_key_idx(ctx, KEY_USE_SILU);
new_clip->use_silu = gguf_get_val_bool(ctx, idx);
} catch (std::runtime_error & /*e*/) {
new_clip->use_silu = false;
}
if (verbosity >= 1) { if (verbosity >= 1) {
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@ -1441,11 +1519,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
try { try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) { } catch(const std::exception& /*e*/) {
LOG_ERR("%s: failed to load vision model tensors\n", __func__); LOG_ERR("%s: failed to load vision model tensors\n", __func__);
} }
try {
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
} catch(const std::exception& /*e*/) {
new_clip->has_qwen2vl_merger = false;
}
// LLaVA projection // LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@ -1544,6 +1627,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W); vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W); vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
} }
else { else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
@ -1583,6 +1671,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
clip_image_f32_batch batch; clip_image_f32_batch batch;
batch.size = 1; batch.size = 1;
batch.data = nullptr;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf); ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@ -1596,6 +1685,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
ctx_clip->load_image_size = load_image_size; ctx_clip->load_image_size = load_image_size;
} }
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
return ctx_clip->load_image_size;
}
struct clip_image_size * clip_image_size_init() { struct clip_image_size * clip_image_size_init() {
struct clip_image_size * load_image_size = new struct clip_image_size(); struct clip_image_size * load_image_size = new struct clip_image_size();
load_image_size->width = 448; load_image_size->width = 448;
@ -2048,6 +2141,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
} }
return true; return true;
} }
else if (ctx->has_qwen2vl_merger) {
clip_image_u8 * resized = clip_image_u8_init();
auto patch_size = clip_patch_size(ctx) * 2;
int nx = ceil((float)img->nx / patch_size) * patch_size;
int ny = ceil((float)img->ny / patch_size) * patch_size;
bicubic_resize(*img, *resized, nx, ny);
res_imgs->data = new clip_image_f32[1];
// clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
// res_imgs->data[0] = *res;
res_imgs->size = 1;
// clip_image_f32_free(res);
clip_image_u8_free(resized);
return true;
}
if(ctx->has_glm_projector){ if(ctx->has_glm_projector){
res_imgs->size = 1; res_imgs->size = 1;
@ -2253,6 +2363,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
} }
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
clip_image_f32 img;
img.nx = img_w;
img.ny = img_h;
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
int32_t clip_image_size(const struct clip_ctx * ctx) { int32_t clip_image_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_size; return ctx->vision_model.hparams.image_size;
} }
@ -2274,6 +2391,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
} }
int clip_n_patches(const struct clip_ctx * ctx) { int clip_n_patches(const struct clip_ctx * ctx) {
clip_image_f32 img;
img.nx = ctx->vision_model.hparams.image_size;
img.ny = ctx->vision_model.hparams.image_size;
return clip_n_patches_by_img(ctx, &img);
}
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams; const auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@ -2287,6 +2411,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
else if (ctx->minicpmv_version == 3) { else if (ctx->minicpmv_version == 3) {
n_patches = 64; n_patches = 64;
} }
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches = x_patch * y_patch;
} }
return n_patches; return n_patches;
@ -2421,7 +2550,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int image_size = hparams.image_size; const int image_size = hparams.image_size;
int image_size_width = image_size; int image_size_width = image_size;
int image_size_height = image_size; int image_size_height = image_size;
if (ctx->has_minicpmv_projector) { if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
image_size_width = imgs->data[0].nx; image_size_width = imgs->data[0].nx;
image_size_height = imgs->data[0].ny; image_size_height = imgs->data[0].ny;
} }
@ -2441,7 +2570,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
for (size_t i = 0; i < imgs->size; i++) { for (size_t i = 0; i < imgs->size; i++) {
const int nx = imgs->data[i].nx; const int nx = imgs->data[i].nx;
const int ny = imgs->data[i].ny; const int ny = imgs->data[i].ny;
if (!ctx->has_minicpmv_projector) { if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
GGML_ASSERT(nx == image_size && ny == image_size); GGML_ASSERT(nx == image_size && ny == image_size);
} }
@ -2499,9 +2628,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
for(int i=0;i<pos_w * pos_h;++i){ for(int i=0;i < pos_w * pos_h; ++i){
for(int j=0;j<embed_dim;++j){ for(int j=0; j < embed_dim; ++j){
pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j]; pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
} }
} }
@ -2521,7 +2650,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
} }
if (ctx->has_qwen2vl_merger) {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int ptr = 0;
for (int y = 0; y < ph; y+=2)
{ {
for (int x = 0; x < pw; x+=2)
{
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
positions_data[ptr] = y + dy;
positions_data[num_patches + ptr] = x + dx;
positions_data[num_patches * 2 + ptr] = y + dy;
positions_data[num_patches * 3 + ptr] = x + dx;
ptr++;
}
}
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
else {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions)); int* positions_data = (int*)malloc(ggml_nbytes(positions));
@ -2530,7 +2686,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data); free(positions_data);
}
if (!ctx->has_glm_projector){ if (!ctx->has_glm_projector){
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
@ -2542,6 +2697,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
free(patches_data); free(patches_data);
} }
} }
}
if (ggml_backend_is_cpu(ctx->backend)) { if (ggml_backend_is_cpu(ctx->backend)) {
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
@ -2722,6 +2878,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_ADAPTER){ if (ctx->proj_type == PROJECTOR_TYPE_ADAPTER){
return ctx->vision_model.mm_model_mlp_3_w->ne[1]; return ctx->vision_model.mm_model_mlp_3_w->ne[1];
} }
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0];
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@ -2737,3 +2896,20 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
bool clip_is_glm(const struct clip_ctx * ctx) { bool clip_is_glm(const struct clip_ctx * ctx) {
return ctx->has_glm_projector; return ctx->has_glm_projector;
} }
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->has_qwen2vl_merger;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
for (int i = 0; i < h*w*3; i++)
{
clip_img.buf[i] = img[i];
}
clip_img.nx = w;
clip_img.ny = h;
clip_image_encode(ctx, n_threads, &clip_img, vec);
return true;
}

View File

@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@ -56,10 +57,12 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_u8 * clip_image_u8_init ();
@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx); CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);

View File

@ -259,18 +259,24 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (clip_is_minicpmv(ctx_clip)) { if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
std::vector<float *> image_embd_v; std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size); image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init(); struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) { for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us(); const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
int patch_size=14; int patch_size=14;
load_image_size->width = img_res_v.data[i].nx; load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny; load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size); clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false; bool encoded = false;
if (clip_is_qwen2vl(ctx_clip)) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
else {
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) { if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
@ -278,6 +284,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
else if (has_minicpmv_projector == 3) { else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
} }
}
if (!encoded) { if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false; return false;
@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
int n_img_pos_out = 0; int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); std::memcpy(
n_img_pos_out += clip_n_patches(ctx_clip); image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
} }
*n_img_pos = n_img_pos_out; *n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -405,6 +416,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
num_max_patches = 1; num_max_patches = 1;
} }
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
float * image_embd;
if (clip_is_qwen2vl(ctx_clip)) {
// qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
} else {
image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
}
if (!image_embd) { if (!image_embd) {
LOG_ERR("Unable to allocate memory for image embeddings\n"); LOG_ERR("Unable to allocate memory for image embeddings\n");
return false; return false;

View File

@ -0,0 +1,165 @@
import argparse
from typing import Dict
import torch
import numpy as np
from gguf import *
from transformers import (
Qwen2VLForConditionalGeneration,
Qwen2VLProcessor,
AutoProcessor,
Qwen2VLConfig
)
VISION = "clip.vision"
def k(raw_key: str, arch: str) -> str:
return raw_key.format(arch=arch)
def to_gguf_name(name: str) -> str:
og = name
name = name.replace("text_model", "t").replace("vision_model", "v")
name = name.replace("blocks", "blk").replace("embeddings.", "")
name = name.replace("attn.", "attn_")
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
name = name.replace("merger.mlp", 'mm')
print(f"[to_gguf_name] {og} --> {name}")
return name
def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
vision_model = qwen2vl.visual
tensor_map = {}
for name, ten in vision_model.state_dict().items():
ten = ten.numpy()
if 'qkv' in name:
if ten.ndim == 2: # weight
c3, _ = ten.shape
else: # bias
c3 = ten.shape[0]
assert c3 % 3 == 0
c = c3 // 3
wq = ten[:c]
wk = ten[c: c * 2]
wv = ten[c * 2:]
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
elif 'merger' in name:
if name.endswith("ln_q.weight"):
tensor_map['v.post_ln.weight'] = ten
elif name.endswith("ln_q.bias"):
tensor_map['v.post_ln.bias'] = ten
else:
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
tensor_map[to_gguf_name(name)] = ten
elif 'patch_embed.proj.weight' in name:
# NOTE: split Conv3D into Conv2Ds
c1, c2, kt, kh, kw = ten.shape
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
else:
tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
for new_name, ten in tensor_map.items():
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
tensor_map[new_name] = ten.astype(np.float32)
else:
tensor_map[new_name] = ten.astype(dtype)
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
return tensor_map
def main(args):
if args.data_type == 'fp32':
dtype = torch.float32
np_dtype = np.float32
ftype = 0
elif args.data_type == 'fp16':
dtype = torch.float32
np_dtype = np.float16
ftype = 1
else:
raise ValueError()
local_model = False
model_path = ""
model_name = args.model_name
print("model_name: ", model_name)
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, torch_dtype=dtype, device_map="cpu"
)
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
vcfg = cfg.vision_config
if os.path.isdir(model_name):
local_model = True
if model_name.endswith(os.sep):
model_name = model_name[:-1]
model_path = model_name
model_name = os.path.basename(model_name)
fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
fout = GGUFWriter(path=fname_out, arch="clip")
fout.add_description("image encoder for Qwen2VL")
fout.add_file_type(ftype)
fout.add_bool("clip.has_text_encoder", False)
fout.add_bool("clip.has_vision_encoder", True)
fout.add_bool("clip.has_qwen2vl_merger", True)
fout.add_string("clip.projector_type", "qwen2vl_merger")
print(cfg.vision_config)
if 'silu' in cfg.vision_config.hidden_act.lower():
fout.add_bool("clip.use_silu", True)
fout.add_bool("clip.use_gelu", False)
elif 'gelu' in cfg.vision_config.hidden_act.lower():
fout.add_bool("clip.use_silu", False)
fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
else:
raise ValueError()
tensor_map = find_vision_tensors(qwen2vl, np_dtype)
for name, data in tensor_map.items():
fout.add_tensor(name, data)
fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder
fout.add_name(model_name)
"""
HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
"""
if local_model:
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
else:
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()
print("save model as: ", fname_out)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
args = parser.parse_args()
main(args)

View File

@ -0,0 +1,581 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef NDEBUG
#include "ggml-alloc.h"
#include "ggml-backend.h"
#endif
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
const int patch_size = 14 * 2;
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
auto img_tokens = image_embed->n_image_pos;
// llama_pos mrope_pos[img_tokens * 4];
std::vector<llama_pos> mrope_pos;
mrope_pos.resize(img_tokens * 4);
for (int y = 0; y < ph; y++)
{
for (int x = 0; x < pw; x++)
{
int i = y * pw + x;
mrope_pos[i] = *st_pos_id;
mrope_pos[i + img_tokens] = *st_pos_id + y;
mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
mrope_pos[i + img_tokens * 3] = 0;
}
}
*st_pos_id += std::max(pw, ph);
int processed = 0;
std::vector<llama_pos> batch_mrope_pos;
batch_mrope_pos.resize(img_tokens * 4);
for (int i = 0; i < img_tokens; i += n_batch) {
int n_eval = img_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
// llama_pos batch_mrope_pos[n_eval * 4];
std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
llama_batch batch = {
int32_t(n_eval), // n_tokens
nullptr, // token
(image_embed->embed+i*n_embd), // embed
batch_mrope_pos.data(), // pos
nullptr, // n_seq_id
nullptr, // seq_id
nullptr, // logits
};
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
processed += n_eval;
}
return true;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
int N = (int) tokens.size();
std::vector<llama_pos> pos;
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
auto batch = llama_batch_get_one(&tokens[i], n_eval);
// TODO: add mrope pos ids somewhere else
pos.resize(batch.n_tokens * 4);
std::fill(pos.begin(), pos.end(), 0);
for (int j = 0; j < batch.n_tokens * 3; j ++) {
pos[j] = *st_pos_id + (j % batch.n_tokens);
}
batch.pos = pos.data();
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
*st_pos_id += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
return true;
}
static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past, int * st_pos_id) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past, st_pos_id);
return ret.c_str();
}
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string& prompt) {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
auto required_bytes = base64::required_encode_size(base64_str.size());
auto img_bytes = std::vector<unsigned char>(required_bytes);
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
return embed;
}
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos) {
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void print_usage(int, char ** argv) {
LOG("\n example usage:\n");
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
// load and preprocess the image
llava_image_embed * embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
LOG_INF("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
}
return embed;
}
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
int n_past = 0;
int cur_pos_id = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<|vision_start|>");
if (image_pos != std::string::npos) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else {
// llava-1.5 native mode
system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
if (image_embed != nullptr) {
auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
// generate the response
LOG("\n");
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
fflush(stdout);
}
common_sampler_free(smpl);
LOG("\n");
}
static struct llama_model * llava_init(common_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
#ifndef NDEBUG
static void debug_test_mrope_2d() {
// 1. Initialize backend
ggml_backend_t backend = NULL;
std::string backend_name = "";
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
backend_name = "cuda";
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
backend = ggml_backend_cpu_init();
backend_name = "cpu";
}
// Calculate the size needed to allocate
size_t ctx_size = 0;
ctx_size += 2 * ggml_tensor_overhead(); // tensors
// no need to allocate anything else!
// 2. Allocate `ggml_context` to store tensor data
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
};
struct ggml_context * ctx = ggml_init(params);
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
ggml_set_name(pos, "pos");
ggml_set_input(pos);
std::vector<float> dummy_q;
dummy_q.resize(128 * 12 * 30);
std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
// memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
std::vector<int> pos_id;
pos_id.resize(30 * 4);
for (int i = 0; i < 30; i ++) {
pos_id[i] = i;
pos_id[i + 30] = i + 10;
pos_id[i + 60] = i + 20;
pos_id[i + 90] = i + 30;
}
int sections[4] = {32, 32, 0, 0};
// 4. Allocate a `ggml_backend_buffer` to store all tensors
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
// 5. Copy tensor data from main memory (RAM) to backend buffer
ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
// 6. Create a `ggml_cgraph` for mul_mat operation
struct ggml_cgraph * gf = NULL;
struct ggml_context * ctx_cgraph = NULL;
// create a temporally context to build the graph
struct ggml_init_params params0 = {
/*.mem_size =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
ctx_cgraph = ggml_init(params0);
gf = ggml_new_graph(ctx_cgraph);
struct ggml_tensor * result0 = ggml_rope_multi(
ctx_cgraph, inp_raw, pos, nullptr,
128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
0, 1, 32, 1);
// Add "result" tensor and all of its dependencies to the cgraph
ggml_build_forward_expand(gf, result0);
// 7. Create a `ggml_gallocr` for cgraph computation
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
ggml_gallocr_alloc_graph(allocr, gf);
// 9. Run the computation
int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
ggml_backend_graph_compute(backend, gf);
// 10. Retrieve results (output tensors)
// in this example, output tensor is always the last tensor in the graph
struct ggml_tensor * result = result0;
// struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
float * result_data = (float *)malloc(ggml_nbytes(result));
// because the tensor data is stored in device buffer, we need to copy it back to RAM
ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
const std::string bin_file = "mrope_2d_" + backend_name +".bin";
std::ofstream outFile(bin_file, std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
outFile.close();
std::cout << "Data successfully written to " + bin_file << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
free(result_data);
// 11. Free memory and exit
ggml_free(ctx_cgraph);
ggml_gallocr_free(allocr);
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
ggml_backend_free(backend);
}
static void debug_dump_img_embed(struct llava_context * ctx_llava) {
int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
int ne = n_embd * 4;
float vals[56 * 56 * 3];
// float embd[ne];
std::vector<float> embd;
embd.resize(ne);
for (int i = 0; i < 56*56; i++)
{
for (int c = 0; c < 3; c++)
vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
}
clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
std::ofstream outFile("img_embed.bin", std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
outFile.close();
std::cout << "Data successfully written to mrope.bin" << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
}
#endif
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
return 1;
}
common_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
auto * model = llava_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt)) {
auto * ctx_llava = llava_init_context(&params, model);
auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
#ifndef NDEBUG
} else if (params.image[0].empty()) {
auto ctx_llava = llava_init_context(&params, model);
debug_test_mrope_2d();
debug_dump_img_embed(ctx_llava);
llama_perf_context_print(ctx_llava->ctx_llama);
ctx_llava->model = NULL;
llava_free(ctx_llava);
#endif
} else {
for (auto & image : params.image) {
auto * ctx_llava = llava_init_context(&params, model);
auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
}
llama_free_model(model);
return 0;
}

View File

@ -177,16 +177,11 @@ Example usage: `--temp 0`
- `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled). - `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). - `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1. The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
### DRY Repetition Penalty ### DRY Repetition Penalty
DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)). DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)).

View File

@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis
Several quantization methods are supported. They differ in the resulting model disk size and inference speed. Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
*(outdated)* *(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
@ -83,7 +81,7 @@ The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interle
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)

View File

@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

View File

@ -107,7 +107,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
} }
float * out = output + batch.seq_id[i][0] * n_embd; float * out = output + batch.seq_id[i][0] * n_embd;
common_embd_normalize(embd, out, n_embd); common_embd_normalize(embd, out, n_embd, 2);
} }
} }
@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
} }
LOG_INF("Number of chunks: %ld\n", chunks.size()); LOG_INF("Number of chunks: %zu\n", chunks.size());
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View File

@ -1,5 +1,5 @@
set(TARGET llama-run) set(TARGET llama-run)
add_executable(${TARGET} run.cpp) add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -3,5 +3,47 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models. The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
```bash ```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf llama-run granite-code
... ```
```bash
llama-run -h
Description:
Runs a llm
Usage:
llama-run [options] model [prompt]
Options:
-c, --context-size <value>
Context size (default: 2048)
-n, --ngl <value>
Number of GPU layers (default: 0)
-v, --verbose, --log-verbose
Set verbosity level to infinity (i.e. log all messages, useful for debugging)
-h, --help
Show help message
Commands:
model
Model is a string with an optional prefix of
huggingface:// (hf://), ollama://, https:// or file://.
If no protocol is specified and a file exists in the specified
path, file:// is assumed, otherwise if a file does not exist in
the specified path, ollama:// is assumed. Models that are being
pulled are downloaded with .partial extension while being
downloaded and then renamed as the file without the .partial
extension when complete.
Examples:
llama-run llama3
llama-run ollama://granite-code
llama-run ollama://smollm:135m
llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
llama-run https://example.com/some-file1.gguf
llama-run some-file2.gguf
llama-run file://some-file3.gguf
llama-run --ngl 999 some-file4.gguf
llama-run --ngl 999 some-file5.gguf Hello World
```

File diff suppressed because it is too large Load Diff

View File

@ -15,7 +15,7 @@ set(TARGET_SRCS
httplib.h httplib.h
) )
set(PUBLIC_ASSETS set(PUBLIC_ASSETS
index.html index.html.gz
loading.html loading.html
) )
@ -34,14 +34,6 @@ endforeach()
add_executable(${TARGET} ${TARGET_SRCS}) add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
# clean up generated files in pre-build step
foreach(asset ${PUBLIC_ASSETS})
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}"
)
endforeach()
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL) if (LLAMA_SERVER_SSL)

View File

@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@ -104,7 +104,6 @@ The project is under active development, and we are [looking for feedback and co
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) | | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--penalize-nl` | penalize newline tokens (default: false) |
| `--temp N` | temperature (default: 0.8) | | `--temp N` | temperature (default: 0.8) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
@ -138,6 +137,7 @@ The project is under active development, and we are [looking for feedback and co
| -------- | ----------- | | -------- | ----------- |
| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `-sp, --special` | special tokens output enabled (default: false) | | `-sp, --special` | special tokens output enabled (default: false) |
| `--no-warmup` | skip warming up the model with an empty run |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
@ -146,6 +146,7 @@ The project is under active development, and we are [looking for feedback and co
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) | | `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) | | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) | | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) | | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@ -163,13 +164,13 @@ The project is under active development, and we are [looking for feedback and co
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) | | `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) | | `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) | | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices | | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model | | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
@ -302,23 +303,23 @@ mkdir llama-client
cd llama-client cd llama-client
``` ```
Create a index.js file and put this inside: Create an index.js file and put this inside:
```javascript ```javascript
const prompt = `Building a website can be done in 10 simple steps:`; const prompt = "Building a website can be done in 10 simple steps:"
async function Test() { async function test() {
let response = await fetch("http://127.0.0.1:8080/completion", { let response = await fetch("http://127.0.0.1:8080/completion", {
method: 'POST', method: "POST",
body: JSON.stringify({ body: JSON.stringify({
prompt, prompt,
n_predict: 512, n_predict: 64,
}) })
}) })
console.log((await response.json()).content) console.log((await response.json()).content)
} }
Test() test()
``` ```
And run it: And run it:
@ -380,7 +381,7 @@ Multiple prompts are also supported. In this case, the completion result will be
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. `stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
`stop`: Specify a JSON array of stopping strings. `stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
@ -391,8 +392,6 @@ These words will not be included in the completion, so make sure to add them to
`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size. `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled. `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled. `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
@ -439,19 +438,22 @@ These words will not be included in the completion, so make sure to add them to
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true` `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
**Response format** **Response format**
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. - Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
```json ```json
{ {
"content": "<the token selected by the model>", "content": "<the token generated by the model>",
"tokens": [ generated token ids if requested ],
"probs": [ "probs": [
{ {
"prob": float, "prob": float,
@ -469,13 +471,16 @@ These words will not be included in the completion, so make sure to add them to
Notice that each `probs` is an array of length `n_probs`. Notice that each `probs` is an array of length `n_probs`.
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The path to the model loaded with `-m` - `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt` - `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token - `stop_type`: Indicating whether the completion has stopped. Possible values are:
- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered - `none`: Generating (not stopped)
- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided - `eos`: Stopped because it encountered the EOS token
- `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered
- `word`: Stopped due to encountering a stopping word from `stop` JSON array provided
- `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word) - `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`) - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
@ -616,14 +621,82 @@ This endpoint is public (no API key check). By default, it is read-only. To make
```json ```json
{ {
"default_generation_settings": { ... }, "default_generation_settings": {
"id": 0,
"id_task": -1,
"n_ctx": 1024,
"speculative": false,
"is_processing": false,
"params": {
"n_predict": -1,
"seed": 4294967295,
"temperature": 0.800000011920929,
"dynatemp_range": 0.0,
"dynatemp_exponent": 1.0,
"top_k": 40,
"top_p": 0.949999988079071,
"min_p": 0.05000000074505806,
"xtc_probability": 0.0,
"xtc_threshold": 0.10000000149011612,
"typical_p": 1.0,
"repeat_last_n": 64,
"repeat_penalty": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
"dry_multiplier": 0.0,
"dry_base": 1.75,
"dry_allowed_length": 2,
"dry_penalty_last_n": -1,
"dry_sequence_breakers": [
"\n",
":",
"\"",
"*"
],
"mirostat": 0,
"mirostat_tau": 5.0,
"mirostat_eta": 0.10000000149011612,
"stop": [],
"max_tokens": -1,
"n_keep": 0,
"n_discard": 0,
"ignore_eos": false,
"stream": true,
"n_probs": 0,
"min_keep": 0,
"grammar": "",
"samplers": [
"dry",
"top_k",
"typ_p",
"top_p",
"min_p",
"xtc",
"temperature"
],
"speculative.n_max": 16,
"speculative.n_min": 5,
"speculative.p_min": 0.8999999761581421,
"timings_per_token": false
},
"prompt": "",
"next_token": {
"has_next_token": true,
"has_new_line": false,
"n_remain": -1,
"n_decoded": 0,
"stopping_word": ""
}
},
"total_slots": 1, "total_slots": 1,
"chat_template": "" "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
"chat_template": "..."
} }
``` ```
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `model_path` - the path to model file (same with `-m` argument)
- `chat_template` - the model's original Jinja2 prompt template - `chat_template` - the model's original Jinja2 prompt template
### POST `/props`: Change server global properties. ### POST `/props`: Change server global properties.
@ -690,6 +763,8 @@ curl http://localhost:8080/v1/chat/completions \
### POST `/v1/embeddings`: OpenAI-compatible embeddings API ### POST `/v1/embeddings`: OpenAI-compatible embeddings API
This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
*Options:* *Options:*
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
@ -722,6 +797,46 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r
}' }'
``` ```
### POST `/embeddings`: non-OpenAI-compatible embeddings API
This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm.
Note that the response format of this endpoint is different from `/v1/embeddings`.
*Options:*
Same as the `/v1/embeddings` endpoint.
*Examples:*
Same as the `/v1/embeddings` endpoint.
**Response format**
```json
[
{
"index": 0,
"embedding": [
[ ... embeddings for token 0 ... ],
[ ... embeddings for token 1 ... ],
[ ... ]
[ ... embeddings for token N-1 ... ],
]
},
...
{
"index": P,
"embedding": [
[ ... embeddings for token 0 ... ],
[ ... embeddings for token 1 ... ],
[ ... ]
[ ... embeddings for token N-1 ... ],
]
}
]
```
### GET `/slots`: Returns the current slots processing state ### GET `/slots`: Returns the current slots processing state
> [!WARNING] > [!WARNING]
@ -738,54 +853,71 @@ Example:
```json ```json
[ [
{ {
"dynatemp_exponent": 1.0,
"dynatemp_range": 0.0,
"frequency_penalty": 0.0,
"grammar": "",
"id": 0, "id": 0,
"ignore_eos": false, "id_task": -1,
"n_ctx": 1024,
"speculative": false,
"is_processing": false, "is_processing": false,
"logit_bias": [], "params": {
"min_p": 0.05000000074505806, "n_predict": -1,
"mirostat": 0, "seed": 4294967295,
"mirostat_eta": 0.10000000149011612, "temperature": 0.800000011920929,
"mirostat_tau": 5.0, "dynatemp_range": 0.0,
"model": "llama-2-7b-32k-instruct.Q2_K.gguf", "dynatemp_exponent": 1.0,
"n_ctx": 2048,
"n_keep": 0,
"n_predict": 100000,
"n_probs": 0,
"next_token": {
"has_next_token": true,
"n_remain": -1,
"n_decoded": 0,
"stopped_eos": false,
"stopped_limit": false,
"stopped_word": false,
"stopping_word": ""
},
"penalize_nl": true,
"presence_penalty": 0.0,
"prompt": "Say hello to llama.cpp",
"repeat_last_n": 64,
"repeat_penalty": 1.100000023841858,
"samplers": [
"top_k",
"typical_p",
"top_p",
"min_p",
"temperature"
],
"seed": 42,
"stop": [
"\n"
],
"stream": false,
"task_id": 0,
"temperature": 0.0,
"top_k": 40, "top_k": 40,
"top_p": 0.949999988079071, "top_p": 0.949999988079071,
"typical_p": 1.0 "min_p": 0.05000000074505806,
"xtc_probability": 0.0,
"xtc_threshold": 0.10000000149011612,
"typical_p": 1.0,
"repeat_last_n": 64,
"repeat_penalty": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
"dry_multiplier": 0.0,
"dry_base": 1.75,
"dry_allowed_length": 2,
"dry_penalty_last_n": -1,
"dry_sequence_breakers": [
"\n",
":",
"\"",
"*"
],
"mirostat": 0,
"mirostat_tau": 5.0,
"mirostat_eta": 0.10000000149011612,
"stop": [],
"max_tokens": -1,
"n_keep": 0,
"n_discard": 0,
"ignore_eos": false,
"stream": true,
"n_probs": 0,
"min_keep": 0,
"grammar": "",
"samplers": [
"dry",
"top_k",
"typ_p",
"top_p",
"min_p",
"xtc",
"temperature"
],
"speculative.n_max": 16,
"speculative.n_min": 5,
"speculative.p_min": 0.8999999761581421,
"timings_per_token": false
},
"prompt": "",
"next_token": {
"has_next_token": true,
"has_new_line": false,
"n_remain": -1,
"n_decoded": 0,
"stopping_word": ""
}
} }
] ]
``` ```

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -39,7 +39,6 @@
temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower
repeat_last_n: 0, // 0 = disable penalty, -1 = context size repeat_last_n: 0, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.0, // 1.0 = disabled repeat_penalty: 1.0, // 1.0 = disabled
penalize_nl: false, // true only useful for infinite completion
dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
dry_base: 1.75, // 0.0 = disabled dry_base: 1.75, // 0.0 = disabled
dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well

View File

@ -303,7 +303,6 @@
temperature: 0.7, temperature: 0.7,
repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_last_n: 256, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.18, // 1.0 = disabled repeat_penalty: 1.18, // 1.0 = disabled
penalize_nl: false,
dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
dry_base: 1.75, // 0.0 = disabled dry_base: 1.75, // 0.0 = disabled
dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
@ -1006,7 +1005,6 @@
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}

View File

@ -407,6 +407,9 @@ class SimpleChat {
if (curLine.startsWith("data:")) { if (curLine.startsWith("data:")) {
curLine = curLine.substring(5); curLine = curLine.substring(5);
} }
if (curLine.trim() === "[DONE]") {
break;
}
let curJson = JSON.parse(curLine); let curJson = JSON.parse(curLine);
console.debug("DBUG:SC:PART:Json:", curJson); console.debug("DBUG:SC:PART:Json:", curJson);
this.append_response(this.response_extract_stream(curJson, apiEP)); this.append_response(this.response_extract_stream(curJson, apiEP));

File diff suppressed because it is too large Load Diff

View File

@ -44,4 +44,10 @@ To run with stdout/stderr display in real time (verbose output, but useful for d
DEBUG=1 ./tests.sh -s -v -x DEBUG=1 ./tests.sh -s -v -x
``` ```
Hint: You can compile and run test in single command, useful for local developement:
```shell
cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh
```
To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html) To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)

View File

@ -1,5 +1,9 @@
#!/bin/bash #!/bin/bash
# make sure we are in the right directory
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
set -eu set -eu
if [ $# -lt 1 ] if [ $# -lt 1 ]

View File

@ -1,4 +1,5 @@
import pytest import pytest
import requests
from utils import * from utils import *
server = ServerPreset.tinyllama2() server = ServerPreset.tinyllama2()
@ -22,7 +23,12 @@ def test_server_props():
server.start() server.start()
res = server.make_request("GET", "/props") res = server.make_request("GET", "/props")
assert res.status_code == 200 assert res.status_code == 200
assert ".gguf" in res.body["model_path"]
assert res.body["total_slots"] == server.n_slots assert res.body["total_slots"] == server.n_slots
default_val = res.body["default_generation_settings"]
assert server.n_ctx is not None and server.n_slots is not None
assert default_val["n_ctx"] == server.n_ctx / server.n_slots
assert default_val["params"]["seed"] == server.seed
def test_server_models(): def test_server_models():
@ -33,6 +39,31 @@ def test_server_models():
assert len(res.body["data"]) == 1 assert len(res.body["data"]) == 1
assert res.body["data"][0]["id"] == server.model_alias assert res.body["data"][0]["id"] == server.model_alias
def test_server_slots():
global server
# without slots endpoint enabled, this should return error
server.server_slots = False
server.start()
res = server.make_request("GET", "/slots")
assert res.status_code == 501 # ERROR_TYPE_NOT_SUPPORTED
assert "error" in res.body
server.stop()
# with slots endpoint enabled, this should return slots info
server.server_slots = True
server.n_slots = 2
server.start()
res = server.make_request("GET", "/slots")
assert res.status_code == 200
assert len(res.body) == server.n_slots
assert server.n_ctx is not None and server.n_slots is not None
assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots
assert "params" in res.body[0]
assert res.body[0]["params"]["seed"] == server.seed
def test_load_split_model(): def test_load_split_model():
global server global server
server.model_hf_repo = "ggml-org/models" server.model_hf_repo = "ggml-org/models"
@ -46,3 +77,20 @@ def test_load_split_model():
}) })
assert res.status_code == 200 assert res.status_code == 200
assert match_regex("(little|girl)+", res.body["content"]) assert match_regex("(little|girl)+", res.body["content"])
def test_no_webui():
global server
# default: webui enabled
server.start()
url = f"http://{server.server_host}:{server.server_port}"
res = requests.get(url)
assert res.status_code == 200
assert "<html>" in res.text
server.stop()
# with --no-webui
server.no_webui = True
server.start()
res = requests.get(url)
assert res.status_code == 404

View File

@ -12,13 +12,13 @@ def create_server():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[ [
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
] ]
) )
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server global server
server.start() server.start()
res = server.make_request("POST", "/chat/completions", data={ res = server.make_request("POST", "/chat/completions", data={
@ -30,29 +30,28 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
], ],
}) })
assert res.status_code == 200 assert res.status_code == 200
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
assert res.body["model"] == model if model is not None else server.model_alias
assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["prompt_tokens"] == n_prompt
assert res.body["usage"]["completion_tokens"] == n_predicted assert res.body["usage"]["completion_tokens"] == n_predicted
choice = res.body["choices"][0] choice = res.body["choices"][0]
assert "assistant" == choice["message"]["role"] assert "assistant" == choice["message"]["role"]
assert match_regex(re_content, choice["message"]["content"]) assert match_regex(re_content, choice["message"]["content"])
if truncated: assert choice["finish_reason"] == finish_reason
assert choice["finish_reason"] == "length"
else:
assert choice["finish_reason"] == "stop"
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[ [
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
] ]
) )
def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server global server
server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
server.start() server.start()
res = server.make_stream_request("POST", "/chat/completions", data={ res = server.make_stream_request("POST", "/chat/completions", data={
"model": model,
"max_tokens": max_tokens, "max_tokens": max_tokens,
"messages": [ "messages": [
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
@ -61,18 +60,19 @@ def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, r
"stream": True, "stream": True,
}) })
content = "" content = ""
last_cmpl_id = None
for data in res: for data in res:
choice = data["choices"][0] choice = data["choices"][0]
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
if last_cmpl_id is None:
last_cmpl_id = data["id"]
assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
if choice["finish_reason"] in ["stop", "length"]: if choice["finish_reason"] in ["stop", "length"]:
assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["prompt_tokens"] == n_prompt
assert data["usage"]["completion_tokens"] == n_predicted assert data["usage"]["completion_tokens"] == n_predicted
assert "content" not in choice["delta"] assert "content" not in choice["delta"]
assert match_regex(re_content, content) assert match_regex(re_content, content)
# FIXME: not sure why this is incorrect in stream mode assert choice["finish_reason"] == finish_reason
# if truncated:
# assert choice["finish_reason"] == "length"
# else:
# assert choice["finish_reason"] == "stop"
else: else:
assert choice["finish_reason"] is None assert choice["finish_reason"] is None
content += choice["delta"]["content"] content += choice["delta"]["content"]
@ -93,7 +93,7 @@ def test_chat_completion_with_openai_library():
temperature=0.8, temperature=0.8,
) )
print(res) print(res)
assert res.choices[0].finish_reason == "stop" assert res.choices[0].finish_reason == "length"
assert res.choices[0].message.content is not None assert res.choices[0].message.content is not None
assert match_regex("(Suddenly)+", res.choices[0].message.content) assert match_regex("(Suddenly)+", res.choices[0].message.content)

View File

@ -10,22 +10,29 @@ def create_server():
global server global server
server = ServerPreset.tinyllama2() server = ServerPreset.tinyllama2()
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [ @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False), ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False), ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
]) ])
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool): def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
global server global server
server.start() server.start()
res = server.make_request("POST", "/completion", data={ res = server.make_request("POST", "/completion", data={
"n_predict": n_predict, "n_predict": n_predict,
"prompt": prompt, "prompt": prompt,
"return_tokens": return_tokens,
}) })
assert res.status_code == 200 assert res.status_code == 200
assert res.body["timings"]["prompt_n"] == n_prompt assert res.body["timings"]["prompt_n"] == n_prompt
assert res.body["timings"]["predicted_n"] == n_predicted assert res.body["timings"]["predicted_n"] == n_predicted
assert res.body["truncated"] == truncated assert res.body["truncated"] == truncated
assert type(res.body["has_new_line"]) == bool
assert match_regex(re_content, res.body["content"]) assert match_regex(re_content, res.body["content"])
if return_tokens:
assert len(res.body["tokens"]) > 0
assert all(type(tok) == int for tok in res.body["tokens"])
else:
assert res.body["tokens"] == []
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [ @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
@ -42,15 +49,42 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
}) })
content = "" content = ""
for data in res: for data in res:
assert "stop" in data and type(data["stop"]) == bool
if data["stop"]: if data["stop"]:
assert data["timings"]["prompt_n"] == n_prompt assert data["timings"]["prompt_n"] == n_prompt
assert data["timings"]["predicted_n"] == n_predicted assert data["timings"]["predicted_n"] == n_predicted
assert data["truncated"] == truncated assert data["truncated"] == truncated
assert data["stop_type"] == "limit"
assert type(data["has_new_line"]) == bool
assert "generation_settings" in data
assert server.n_predict is not None
assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
assert data["generation_settings"]["seed"] == server.seed
assert match_regex(re_content, content) assert match_regex(re_content, content)
else: else:
assert len(data["tokens"]) > 0
assert all(type(tok) == int for tok in data["tokens"])
content += data["content"] content += data["content"]
def test_completion_stream_vs_non_stream():
global server
server.start()
res_stream = server.make_stream_request("POST", "/completion", data={
"n_predict": 8,
"prompt": "I believe the meaning of life is",
"stream": True,
})
res_non_stream = server.make_request("POST", "/completion", data={
"n_predict": 8,
"prompt": "I believe the meaning of life is",
})
content_stream = ""
for data in res_stream:
content_stream += data["content"]
assert content_stream == res_non_stream.body["content"]
@pytest.mark.parametrize("n_slots", [1, 2]) @pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int): def test_consistent_result_same_seed(n_slots: int):
global server global server
@ -221,3 +255,24 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
assert len(res.body["content"]) > 10 assert len(res.body["content"]) > 10
# FIXME: the result is not deterministic when using other slot than slot 0 # FIXME: the result is not deterministic when using other slot than slot 0
# assert match_regex(re_content, res.body["content"]) # assert match_regex(re_content, res.body["content"])
def test_n_probs():
global server
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"n_probs": 10,
"temperature": 0.0,
"n_predict": 5,
})
assert res.status_code == 200
assert "completion_probabilities" in res.body
assert len(res.body["completion_probabilities"]) == 5
for tok in res.body["completion_probabilities"]:
assert "probs" in tok
assert len(tok["probs"]) == 10
for prob in tok["probs"]:
assert "prob" in prob
assert "tok_str" in prob
assert 0.0 <= prob["prob"] <= 1.0

View File

@ -14,8 +14,9 @@ def create_server():
def test_embedding_single(): def test_embedding_single():
global server global server
server.pooling = 'last'
server.start() server.start()
res = server.make_request("POST", "/embeddings", data={ res = server.make_request("POST", "/v1/embeddings", data={
"input": "I believe the meaning of life is", "input": "I believe the meaning of life is",
}) })
assert res.status_code == 200 assert res.status_code == 200
@ -29,8 +30,9 @@ def test_embedding_single():
def test_embedding_multiple(): def test_embedding_multiple():
global server global server
server.pooling = 'last'
server.start() server.start()
res = server.make_request("POST", "/embeddings", data={ res = server.make_request("POST", "/v1/embeddings", data={
"input": [ "input": [
"I believe the meaning of life is", "I believe the meaning of life is",
"Write a joke about AI from a very long prompt which will not be truncated", "Write a joke about AI from a very long prompt which will not be truncated",
@ -45,10 +47,69 @@ def test_embedding_multiple():
assert len(d['embedding']) > 1 assert len(d['embedding']) > 1
def test_embedding_openai_library_single(): @pytest.mark.parametrize(
"input,is_multi_prompt",
[
# single prompt
("string", False),
([12, 34, 56], False),
([12, 34, "string", 56, 78], False),
# multiple prompts
(["string1", "string2"], True),
(["string1", [12, 34, 56]], True),
([[12, 34, 56], [12, 34, 56]], True),
([[12, 34, 56], [12, "string", 34, 56]], True),
]
)
def test_embedding_mixed_input(input, is_multi_prompt: bool):
global server global server
server.start() server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") res = server.make_request("POST", "/v1/embeddings", data={"input": input})
assert res.status_code == 200
data = res.body['data']
if is_multi_prompt:
assert len(data) == len(input)
for d in data:
assert 'embedding' in d
assert len(d['embedding']) > 1
else:
assert 'embedding' in data[0]
assert len(data[0]['embedding']) > 1
def test_embedding_pooling_none():
global server
server.pooling = 'none'
server.start()
res = server.make_request("POST", "/embeddings", data={
"input": "hello hello hello",
})
assert res.status_code == 200
assert 'embedding' in res.body[0]
assert len(res.body[0]['embedding']) == 5 # 3 text tokens + 2 special
# make sure embedding vector is not normalized
for x in res.body[0]['embedding']:
assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON
def test_embedding_pooling_none_oai():
global server
server.pooling = 'none'
server.start()
res = server.make_request("POST", "/v1/embeddings", data={
"input": "hello hello hello",
})
# /v1/embeddings does not support pooling type 'none'
assert res.status_code == 400
def test_embedding_openai_library_single():
global server
server.pooling = 'last'
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is") res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
assert len(res.data) == 1 assert len(res.data) == 1
assert len(res.data[0].embedding) > 1 assert len(res.data[0].embedding) > 1
@ -56,8 +117,9 @@ def test_embedding_openai_library_single():
def test_embedding_openai_library_multiple(): def test_embedding_openai_library_multiple():
global server global server
server.pooling = 'last'
server.start() server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.embeddings.create(model="text-embedding-3-small", input=[ res = client.embeddings.create(model="text-embedding-3-small", input=[
"I believe the meaning of life is", "I believe the meaning of life is",
"Write a joke about AI from a very long prompt which will not be truncated", "Write a joke about AI from a very long prompt which will not be truncated",
@ -71,8 +133,9 @@ def test_embedding_openai_library_multiple():
def test_embedding_error_prompt_too_long(): def test_embedding_error_prompt_too_long():
global server global server
server.pooling = 'last'
server.start() server.start()
res = server.make_request("POST", "/embeddings", data={ res = server.make_request("POST", "/v1/embeddings", data={
"input": "This is a test " * 512, "input": "This is a test " * 512,
}) })
assert res.status_code != 200 assert res.status_code != 200
@ -80,8 +143,9 @@ def test_embedding_error_prompt_too_long():
def test_same_prompt_give_same_result(): def test_same_prompt_give_same_result():
server.pooling = 'last'
server.start() server.start()
res = server.make_request("POST", "/embeddings", data={ res = server.make_request("POST", "/v1/embeddings", data={
"input": [ "input": [
"I believe the meaning of life is", "I believe the meaning of life is",
"I believe the meaning of life is", "I believe the meaning of life is",
@ -97,3 +161,33 @@ def test_same_prompt_give_same_result():
vi = res.body['data'][i]['embedding'] vi = res.body['data'][i]['embedding']
for x, y in zip(v0, vi): for x, y in zip(v0, vi):
assert abs(x - y) < EPSILON assert abs(x - y) < EPSILON
@pytest.mark.parametrize(
"content,n_tokens",
[
("I believe the meaning of life is", 9),
("This is a test", 6),
]
)
def test_embedding_usage_single(content, n_tokens):
global server
server.start()
res = server.make_request("POST", "/v1/embeddings", data={"input": content})
assert res.status_code == 200
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
assert res.body['usage']['prompt_tokens'] == n_tokens
def test_embedding_usage_multiple():
global server
server.start()
res = server.make_request("POST", "/v1/embeddings", data={
"input": [
"I believe the meaning of life is",
"I believe the meaning of life is",
],
})
assert res.status_code == 200
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
assert res.body['usage']['prompt_tokens'] == 2 * 9

View File

@ -13,28 +13,28 @@ def test_infill_without_input_extra():
global server global server
server.start() server.start()
res = server.make_request("POST", "/infill", data={ res = server.make_request("POST", "/infill", data={
"prompt": "Complete this", "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", "prompt": " int n_threads = llama_",
"input_suffix": "}\n", "input_suffix": "}\n",
}) })
assert res.status_code == 200 assert res.status_code == 200
assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"]) assert match_regex("(Ann|small|shiny)+", res.body["content"])
def test_infill_with_input_extra(): def test_infill_with_input_extra():
global server global server
server.start() server.start()
res = server.make_request("POST", "/infill", data={ res = server.make_request("POST", "/infill", data={
"prompt": "Complete this",
"input_extra": [{ "input_extra": [{
"filename": "llama.h", "filename": "llama.h",
"text": "LLAMA_API int32_t llama_n_threads();\n" "text": "LLAMA_API int32_t llama_n_threads();\n"
}], }],
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
"prompt": " int n_threads = llama_",
"input_suffix": "}\n", "input_suffix": "}\n",
}) })
assert res.status_code == 200 assert res.status_code == 200
assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"]) assert match_regex("(Dad|excited|park)+", res.body["content"])
@pytest.mark.parametrize("input_extra", [ @pytest.mark.parametrize("input_extra", [
@ -48,10 +48,30 @@ def test_invalid_input_extra_req(input_extra):
global server global server
server.start() server.start()
res = server.make_request("POST", "/infill", data={ res = server.make_request("POST", "/infill", data={
"prompt": "Complete this",
"input_extra": [input_extra], "input_extra": [input_extra],
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
"prompt": " int n_threads = llama_",
"input_suffix": "}\n", "input_suffix": "}\n",
}) })
assert res.status_code == 400 assert res.status_code == 400
assert "error" in res.body assert "error" in res.body
@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_qwen_model():
global server
server.model_file = None
server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
server.start(timeout_seconds=600)
res = server.make_request("POST", "/infill", data={
"input_extra": [{
"filename": "llama.h",
"text": "LLAMA_API int32_t llama_n_threads();\n"
}],
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
"prompt": " int n_threads = llama_",
"input_suffix": "}\n",
})
assert res.status_code == 200
assert res.body["content"] == "n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n"

View File

@ -53,3 +53,26 @@ def test_invalid_rerank_req(documents):
}) })
assert res.status_code == 400 assert res.status_code == 400
assert "error" in res.body assert "error" in res.body
@pytest.mark.parametrize(
"query,doc1,doc2,n_tokens",
[
("Machine learning is", "A machine", "Learning is", 19),
("Which city?", "Machine learning is ", "Paris, capitale de la", 26),
]
)
def test_rerank_usage(query, doc1, doc2, n_tokens):
global server
server.start()
res = server.make_request("POST", "/rerank", data={
"query": query,
"documents": [
doc1,
doc2,
]
})
assert res.status_code == 200
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
assert res.body['usage']['prompt_tokens'] == n_tokens

View File

@ -82,6 +82,37 @@ def test_different_draft_min_draft_max():
last_content = res.body["content"] last_content = res.body["content"]
def test_slot_ctx_not_exceeded():
global server
server.n_ctx = 64
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
"temperature": 0.0,
"top_k": 1,
"speculative.p_min": 0.0,
})
assert res.status_code == 200
assert len(res.body["content"]) > 0
def test_with_ctx_shift():
global server
server.n_ctx = 64
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
"temperature": 0.0,
"top_k": 1,
"n_predict": 64,
"speculative.p_min": 0.0,
})
assert res.status_code == 200
assert len(res.body["content"]) > 0
assert res.body["tokens_predicted"] == 64
assert res.body["truncated"] == True
@pytest.mark.parametrize("n_slots,n_requests", [ @pytest.mark.parametrize("n_slots,n_requests", [
(1, 2), (1, 2),
(2, 2), (2, 2),

View File

@ -64,6 +64,8 @@ class ServerProcess:
server_embeddings: bool | None = False server_embeddings: bool | None = False
server_reranking: bool | None = False server_reranking: bool | None = False
server_metrics: bool | None = False server_metrics: bool | None = False
server_slots: bool | None = False
pooling: str | None = None
draft: int | None = None draft: int | None = None
api_key: str | None = None api_key: str | None = None
response_format: str | None = None response_format: str | None = None
@ -71,6 +73,7 @@ class ServerProcess:
disable_ctx_shift: int | None = False disable_ctx_shift: int | None = False
draft_min: int | None = None draft_min: int | None = None
draft_max: int | None = None draft_max: int | None = None
no_webui: bool | None = None
# session variables # session variables
process: subprocess.Popen | None = None process: subprocess.Popen | None = None
@ -91,7 +94,6 @@ class ServerProcess:
else: else:
server_path = "../../../build/bin/llama-server" server_path = "../../../build/bin/llama-server"
server_args = [ server_args = [
"--slots", # requires to get slot status via /slots endpoint
"--host", "--host",
self.server_host, self.server_host,
"--port", "--port",
@ -129,6 +131,10 @@ class ServerProcess:
server_args.append("--reranking") server_args.append("--reranking")
if self.server_metrics: if self.server_metrics:
server_args.append("--metrics") server_args.append("--metrics")
if self.server_slots:
server_args.append("--slots")
if self.pooling:
server_args.extend(["--pooling", self.pooling])
if self.model_alias: if self.model_alias:
server_args.extend(["--alias", self.model_alias]) server_args.extend(["--alias", self.model_alias])
if self.n_ctx: if self.n_ctx:
@ -156,6 +162,8 @@ class ServerProcess:
server_args.extend(["--draft-max", self.draft_max]) server_args.extend(["--draft-max", self.draft_max])
if self.draft_min: if self.draft_min:
server_args.extend(["--draft-min", self.draft_min]) server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
server_args.append("--no-webui")
args = [str(arg) for arg in [server_path, *server_args]] args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}") print(f"bench: starting server with: {' '.join(args)}")
@ -181,7 +189,7 @@ class ServerProcess:
start_time = time.time() start_time = time.time()
while time.time() - start_time < timeout_seconds: while time.time() - start_time < timeout_seconds:
try: try:
response = self.make_request("GET", "/slots", headers={ response = self.make_request("GET", "/health", headers={
"Authorization": f"Bearer {self.api_key}" if self.api_key else None "Authorization": f"Bearer {self.api_key}" if self.api_key else None
}) })
if response.status_code == 200: if response.status_code == 200:
@ -224,7 +232,7 @@ class ServerProcess:
result.headers = dict(response.headers) result.headers = dict(response.headers)
result.status_code = response.status_code result.status_code = response.status_code
result.body = response.json() if parse_body else None result.body = response.json() if parse_body else None
print("Response from server", result.body) print("Response from server", json.dumps(result.body, indent=2))
return result return result
def make_stream_request( def make_stream_request(
@ -245,7 +253,7 @@ class ServerProcess:
break break
elif line.startswith('data: '): elif line.startswith('data: '):
data = json.loads(line[6:]) data = json.loads(line[6:])
print("Partial response from server", data) print("Partial response from server", json.dumps(data, indent=2))
yield data yield data
@ -369,3 +377,6 @@ def match_regex(regex: str, text: str) -> bool:
).search(text) ).search(text)
is not None is not None
) )
def is_slow_test_allowed():
return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"

View File

@ -222,7 +222,6 @@
temperature: 0.7, temperature: 0.7,
repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_last_n: 256, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.18, // 1.0 = disabled repeat_penalty: 1.18, // 1.0 = disabled
penalize_nl: false,
top_k: 40, // <= 0 to use vocab size top_k: 40, // <= 0 to use vocab size
top_p: 0.95, // 1.0 = disabled top_p: 0.95, // 1.0 = disabled
min_p: 0.05, // 0 = disabled min_p: 0.05, // 0 = disabled
@ -779,7 +778,6 @@
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}

View File

@ -225,7 +225,6 @@
temperature: 0.7, temperature: 0.7,
repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_last_n: 256, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.18, // 1.0 = disabled repeat_penalty: 1.18, // 1.0 = disabled
penalize_nl: false,
top_k: 40, // <= 0 to use vocab size top_k: 40, // <= 0 to use vocab size
top_p: 0.95, // 1.0 = disabled top_p: 0.95, // 1.0 = disabled
min_p: 0.05, // 0 = disabled min_p: 0.05, // 0 = disabled
@ -782,7 +781,6 @@
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}

View File

@ -20,8 +20,9 @@
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <memory>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
using json = nlohmann::ordered_json; using json = nlohmann::ordered_json;
@ -40,17 +41,6 @@ using json = nlohmann::ordered_json;
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
ERROR_TYPE_INVALID_REQUEST,
ERROR_TYPE_AUTHENTICATION,
ERROR_TYPE_SERVER,
ERROR_TYPE_NOT_FOUND,
ERROR_TYPE_PERMISSION,
ERROR_TYPE_UNAVAILABLE, // custom error
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
template <typename T> template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) { static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value // Fallback null to default value
@ -148,6 +138,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
* and multiple prompts (multi-tasks): * and multiple prompts (multi-tasks):
* - "prompt": ["string1", "string2"] * - "prompt": ["string1", "string2"]
* - "prompt": ["string1", [12, 34, 56]] * - "prompt": ["string1", [12, 34, 56]]
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]] * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
*/ */
static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
@ -174,6 +165,9 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
} else { } else {
throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts"); throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
} }
if (result.empty()) {
throw std::runtime_error("\"prompt\" must not be empty");
}
return result; return result;
} }
@ -337,12 +331,12 @@ static std::string llama_get_chat_template(const struct llama_model * model) {
std::string template_key = "tokenizer.chat_template"; std::string template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string // call with NULL buffer to get the total size of the string
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0); int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
if (res < 0) { if (res < 2) {
return ""; return "";
} else { } else {
std::vector<char> model_template(res, 0); std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size()); return std::string(model_template.data(), model_template.size() - 1);
} }
} }
@ -485,48 +479,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
return out; return out;
} }
struct completion_token_output {
llama_token tok;
std::string text_to_send;
struct token_prob {
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
};
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
json out = json::array();
for (const auto & prob : probs) {
json probs_for_token = json::array();
for (const auto & p : prob.probs) {
const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
probs_for_token.push_back(json {
{"tok_str", tok_str},
{"prob", p.prob},
});
}
const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json {
{"content", tok_str},
{"probs", probs_for_token},
});
}
return out;
}
static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) { static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
const std::string str = const std::string str =
std::string(event) + ": " + std::string(event) + ": " +
data.dump(-1, ' ', false, json::error_handler_t::replace) + data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain) "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
LOG_DBG("data stream, to_send: %s", str.c_str()); LOG_DBG("data stream, to_send: %s", str.c_str());
@ -543,8 +500,6 @@ static json oaicompat_completion_params_parse(
const std::string & chat_template) { const std::string & chat_template) {
json llama_params; json llama_params;
llama_params["__oaicompat"] = true;
// Apply chat template to the list of messages // Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
@ -604,166 +559,9 @@ static json oaicompat_completion_params_parse(
return llama_params; return llama_params;
} }
static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res = json {
{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage", json {
{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}},
{"id", completion_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
if (result.contains("timings")) {
res.push_back({"timings", json_value(result, "timings", json::object())});
}
return res;
}
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in task processing logic makes several trailing calls
// with empty content, we ignore these at the calee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json {
{"choices", choices},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}
};
if (result.contains("timings")) {
ret.push_back({"timings", json_value(result, "timings", json::object())});
}
if (!finish_reason.empty()) {
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
ret.push_back({"usage", json {
{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}});
}
return std::vector<json>({ret});
}
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array(); json data = json::array();
int32_t n_tokens = 0;
int i = 0; int i = 0;
for (const auto & elem : embeddings) { for (const auto & elem : embeddings) {
data.push_back(json{ data.push_back(json{
@ -771,14 +569,16 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
{"index", i++}, {"index", i++},
{"object", "embedding"} {"object", "embedding"}
}); });
n_tokens += json_value(elem, "tokens_evaluated", 0);
} }
json res = json { json res = json {
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", "list"}, {"object", "list"},
{"usage", json { // TODO: fill {"usage", json {
{"prompt_tokens", 0}, {"prompt_tokens", n_tokens},
{"total_tokens", 0} {"total_tokens", n_tokens}
}}, }},
{"data", data} {"data", data}
}; };
@ -788,20 +588,23 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
static json format_response_rerank(const json & request, const json & ranks) { static json format_response_rerank(const json & request, const json & ranks) {
json data = json::array(); json data = json::array();
int32_t n_tokens = 0;
int i = 0; int i = 0;
for (const auto & rank : ranks) { for (const auto & rank : ranks) {
data.push_back(json{ data.push_back(json{
{"index", i++}, {"index", i++},
{"relevance_score", json_value(rank, "score", 0.0)}, {"relevance_score", json_value(rank, "score", 0.0)},
}); });
n_tokens += json_value(rank, "tokens_evaluated", 0);
} }
json res = json { json res = json {
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", "list"}, {"object", "list"},
{"usage", json { // TODO: fill {"usage", json {
{"prompt_tokens", 0}, {"prompt_tokens", n_tokens},
{"total_tokens", 0} {"total_tokens", n_tokens}
}}, }},
{"results", data} {"results", data}
}; };
@ -854,42 +657,17 @@ static json format_detokenized_response(const std::string & content) {
}; };
} }
static json format_error_response(const std::string & message, const enum error_type type) { static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
std::string type_str; json data = json::array();
int code = 500; for (const auto & lb : logit_bias) {
switch (type) { data.push_back(json{
case ERROR_TYPE_INVALID_REQUEST: {"bias", lb.bias},
type_str = "invalid_request_error"; {"token", lb.token},
code = 400; });
break;
case ERROR_TYPE_AUTHENTICATION:
type_str = "authentication_error";
code = 401;
break;
case ERROR_TYPE_NOT_FOUND:
type_str = "not_found_error";
code = 404;
break;
case ERROR_TYPE_SERVER:
type_str = "server_error";
code = 500;
break;
case ERROR_TYPE_PERMISSION:
type_str = "permission_error";
code = 403;
break;
case ERROR_TYPE_NOT_SUPPORTED:
type_str = "not_supported_error";
code = 501;
break;
case ERROR_TYPE_UNAVAILABLE:
type_str = "unavailable_error";
code = 503;
break;
} }
return json { return data;
{"code", code}, }
{"message", message},
{"type", type_str}, static std::string safe_json_to_str(json data) {
}; return data.dump(-1, ' ', false, json::error_handler_t::replace);
} }

View File

@ -15,7 +15,7 @@
<!-- sidebar --> <!-- sidebar -->
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64"> <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label> <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
<div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4"> <div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
<div class="flex flex-row items-center justify-between mb-4 mt-4"> <div class="flex flex-row items-center justify-between mb-4 mt-4">
<h2 class="font-bold ml-4">Conversations</h2> <h2 class="font-bold ml-4">Conversations</h2>
@ -120,51 +120,25 @@
{{ messages.length === 0 ? 'Send a message to start' : '' }} {{ messages.length === 0 ? 'Send a message to start' : '' }}
</div> </div>
<div v-for="msg in messages" class="group"> <div v-for="msg in messages" class="group">
<div :class="{ <message-bubble
'chat': true, :config="config"
'chat-start': msg.role !== 'user', :msg="msg"
'chat-end': msg.role === 'user', :key="msg.id"
}"> :is-generating="isGenerating"
<div :class="{ :edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
'chat-bubble markdown': true, :regenerate-msg="regenerateMsg"></message-bubble>
'chat-bubble-base-300': msg.role !== 'user',
}">
<!-- textarea for editing message -->
<template v-if="editingMsg && editingMsg.id === msg.id">
<textarea
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="msg.content"></textarea>
<br/>
<button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
<button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
</template>
<!-- render message as markdown -->
<vue-markdown v-else :source="msg.content" />
</div>
</div>
<!-- actions for each message -->
<div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
<!-- user message -->
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
✍️ Edit
</button>
<!-- assistant message -->
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
🔄 Regenerate
</button>
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
📋 Copy
</button>
</div>
</div> </div>
<!-- pending (ongoing) assistant message --> <!-- pending (ongoing) assistant message -->
<div id="pending-msg" class="chat chat-start"> <div id="pending-msg" class="group">
<div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300"> <message-bubble
<span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span> v-if="pendingMsg"
<vue-markdown v-else :source="pendingMsg.content" /> :config="config"
</div> :msg="pendingMsg"
:key="pendingMsg.id"
:is-generating="isGenerating"
:edit-user-msg-and-regenerate="() => {}"
:regenerate-msg="() => {}"></message-bubble>
</div> </div>
</div> </div>
@ -227,6 +201,14 @@
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible"> <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary> <summary class="collapse-title font-bold">Advanced config</summary>
<div class="collapse-content"> <div class="collapse-content">
<div class="flex flex-row items-center mb-2" v-if="isDev">
<!-- this button only shows in dev mode, used to import a demo conversation to test message rendering -->
<button class="btn" @click="debugImportDemoConv()">(debug) Import demo conversation</button>
</div>
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
<span class="ml-4">Show tokens per second</span>
</div>
<label class="form-control mb-2"> <label class="form-control mb-2">
<!-- Custom parameters input --> <!-- Custom parameters input -->
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div> <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
@ -247,6 +229,66 @@
</div> </div>
<!-- Template to be used as message bubble -->
<template id="message-bubble">
<div :class="{
'chat': true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
}">
<div :class="{
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
}">
<!-- textarea for editing message -->
<template v-if="editingContent !== null">
<textarea
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="editingContent"></textarea>
<br/>
<button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
<button class="btn mt-2" @click="editMsg()">Submit</button>
</template>
<template v-else>
<!-- show loading dots for pending message -->
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<vue-markdown v-else :source="msg.content"></vue-markdown>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
<div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
<b>Prompt</b><br/>
- Tokens: {{ timings.prompt_n }}<br/>
- Time: {{ timings.prompt_ms }} ms<br/>
- Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
<b>Generation</b><br/>
- Tokens: {{ timings.predicted_n }}<br/>
- Time: {{ timings.predicted_ms }} ms<br/>
- Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
</div>
</div>
</template>
</div>
</div>
<!-- actions for each message -->
<div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
<!-- user message -->
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
✍️ Edit
</button>
<!-- assistant message -->
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
🔄 Regenerate
</button>
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
📋 Copy
</button>
</div>
</template>
<!-- Template to be used by settings modal --> <!-- Template to be used by settings modal -->
<template id="settings-modal-short-input"> <template id="settings-modal-short-input">
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2"> <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">

View File

@ -8,15 +8,21 @@
"name": "webui", "name": "webui",
"version": "0.0.0", "version": "0.0.0",
"dependencies": { "dependencies": {
"@sec-ant/readable-stream": "^0.6.0",
"@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20", "autoprefixer": "^10.4.20",
"daisyui": "^4.12.14", "daisyui": "^4.12.14",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"markdown-it": "^14.1.0", "markdown-it": "^14.1.0",
"postcss": "^8.4.49", "postcss": "^8.4.49",
"tailwindcss": "^3.4.15", "tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3", "vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13" "vue": "^3.5.13"
}, },
"devDependencies": { "devDependencies": {
"sass-embedded": "^1.83.0",
"vite": "^5.4.10" "vite": "^5.4.10"
} }
}, },
@ -32,6 +38,13 @@
"url": "https://github.com/sponsors/sindresorhus" "url": "https://github.com/sponsors/sindresorhus"
} }
}, },
"node_modules/@bufbuild/protobuf": {
"version": "2.2.3",
"resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.2.3.tgz",
"integrity": "sha512-tFQoXHJdkEOSwj5tRIZSPNUuXK3RaR7T1nUrPgbYX1pUbvqqaaZAsfo+NXBPsz5rZMSKVFrgK1WL8Q/MSLvprg==",
"devOptional": true,
"license": "(Apache-2.0 AND BSD-3-Clause)"
},
"node_modules/@esbuild/aix-ppc64": { "node_modules/@esbuild/aix-ppc64": {
"version": "0.21.5", "version": "0.21.5",
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
@ -605,6 +618,21 @@
"win32" "win32"
] ]
}, },
"node_modules/@sec-ant/readable-stream": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/@sec-ant/readable-stream/-/readable-stream-0.6.0.tgz",
"integrity": "sha512-uiBh8DrB5FN35gP6/o8JEhEQ7/ci1jUsOZO/VMUjyvTpjtV54VstOXVj1TvTj/wsT23pfX6butxxh3qufsW3+g==",
"license": "MIT"
},
"node_modules/@vscode/markdown-it-katex": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@vscode/markdown-it-katex/-/markdown-it-katex-1.1.1.tgz",
"integrity": "sha512-3KTlbsRBPJQLE2YmLL7K6nunTlU+W9T5+FjfNdWuIUKgxSS6HWLQHaO3L4MkJi7z7MpIPpY+g4N+cWNBPE/MSA==",
"license": "MIT",
"dependencies": {
"katex": "^0.16.4"
}
},
"node_modules/@vue/compiler-dom": { "node_modules/@vue/compiler-dom": {
"version": "3.5.13", "version": "3.5.13",
"resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz", "resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz",
@ -1003,6 +1031,13 @@
"browserslist": ">= 4.21.0" "browserslist": ">= 4.21.0"
} }
}, },
"node_modules/buffer-builder": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/buffer-builder/-/buffer-builder-0.2.0.tgz",
"integrity": "sha512-7VPMEPuYznPSoR21NE1zvd2Xna6c/CloiZCfcMXR1Jny6PjX0N4Nsa38zcBFo/FMK+BlA+FLKbJCQ0i2yxp+Xg==",
"devOptional": true,
"license": "MIT/X11"
},
"node_modules/caniuse-lite": { "node_modules/caniuse-lite": {
"version": "1.0.30001684", "version": "1.0.30001684",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz", "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz",
@ -1165,6 +1200,22 @@
"node": ">=8.0" "node": ">=8.0"
} }
}, },
"node_modules/colorjs.io": {
"version": "0.5.2",
"resolved": "https://registry.npmjs.org/colorjs.io/-/colorjs.io-0.5.2.tgz",
"integrity": "sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==",
"devOptional": true,
"license": "MIT"
},
"node_modules/commander": {
"version": "8.3.0",
"resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz",
"integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==",
"license": "MIT",
"engines": {
"node": ">= 12"
}
},
"node_modules/css-selector-tokenizer": { "node_modules/css-selector-tokenizer": {
"version": "0.8.0", "version": "0.8.0",
"resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz", "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz",
@ -1472,6 +1523,31 @@
"node": ">=10.13.0" "node": ">=10.13.0"
} }
}, },
"node_modules/has-flag": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
"integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
"devOptional": true,
"license": "MIT",
"engines": {
"node": ">=8"
}
},
"node_modules/highlight.js": {
"version": "11.10.0",
"resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.10.0.tgz",
"integrity": "sha512-SYVnVFswQER+zu1laSya563s+F8VDGt7o35d4utbamowvUNLLMovFqwCLSocpZTz3MgaSRA1IbqRWZv97dtErQ==",
"engines": {
"node": ">=12.0.0"
}
},
"node_modules/immutable": {
"version": "5.0.3",
"resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.3.tgz",
"integrity": "sha512-P8IdPQHq3lA1xVeBRi5VPqUm5HDgKnx0Ru51wZz5mjxHr5n3RWhjIpOFU7ybkUxfB+5IToy+OLaHYDBIWsv+uw==",
"devOptional": true,
"license": "MIT"
},
"node_modules/is-glob": { "node_modules/is-glob": {
"version": "4.0.3", "version": "4.0.3",
"resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
@ -1502,6 +1578,22 @@
"jiti": "bin/jiti.js" "jiti": "bin/jiti.js"
} }
}, },
"node_modules/katex": {
"version": "0.16.15",
"resolved": "https://registry.npmjs.org/katex/-/katex-0.16.15.tgz",
"integrity": "sha512-yE9YJIEAk2aZ+FL/G8r+UGw0CTUzEA8ZFy6E+8tc3spHUKq3qBnzCkI1CQwGoI9atJhVyFPEypQsTY7mJ1Pi9w==",
"funding": [
"https://opencollective.com/katex",
"https://github.com/sponsors/katex"
],
"license": "MIT",
"dependencies": {
"commander": "^8.3.0"
},
"bin": {
"katex": "cli.js"
}
},
"node_modules/lilconfig": { "node_modules/lilconfig": {
"version": "2.1.0", "version": "2.1.0",
"resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz",
@ -2021,6 +2113,381 @@
"integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==", "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/rxjs": {
"version": "7.8.1",
"resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz",
"integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==",
"devOptional": true,
"license": "Apache-2.0",
"dependencies": {
"tslib": "^2.1.0"
}
},
"node_modules/sass-embedded": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded/-/sass-embedded-1.83.0.tgz",
"integrity": "sha512-/8cYZeL39evUqe0o//193na51Q1VWZ61qhxioQvLJwOtWIrX+PgNhCyD8RSuTtmzc4+6+waFZf899bfp/MCUwA==",
"devOptional": true,
"license": "MIT",
"dependencies": {
"@bufbuild/protobuf": "^2.0.0",
"buffer-builder": "^0.2.0",
"colorjs.io": "^0.5.0",
"immutable": "^5.0.2",
"rxjs": "^7.4.0",
"supports-color": "^8.1.1",
"sync-child-process": "^1.0.2",
"varint": "^6.0.0"
},
"bin": {
"sass": "dist/bin/sass.js"
},
"engines": {
"node": ">=16.0.0"
},
"optionalDependencies": {
"sass-embedded-android-arm": "1.83.0",
"sass-embedded-android-arm64": "1.83.0",
"sass-embedded-android-ia32": "1.83.0",
"sass-embedded-android-riscv64": "1.83.0",
"sass-embedded-android-x64": "1.83.0",
"sass-embedded-darwin-arm64": "1.83.0",
"sass-embedded-darwin-x64": "1.83.0",
"sass-embedded-linux-arm": "1.83.0",
"sass-embedded-linux-arm64": "1.83.0",
"sass-embedded-linux-ia32": "1.83.0",
"sass-embedded-linux-musl-arm": "1.83.0",
"sass-embedded-linux-musl-arm64": "1.83.0",
"sass-embedded-linux-musl-ia32": "1.83.0",
"sass-embedded-linux-musl-riscv64": "1.83.0",
"sass-embedded-linux-musl-x64": "1.83.0",
"sass-embedded-linux-riscv64": "1.83.0",
"sass-embedded-linux-x64": "1.83.0",
"sass-embedded-win32-arm64": "1.83.0",
"sass-embedded-win32-ia32": "1.83.0",
"sass-embedded-win32-x64": "1.83.0"
}
},
"node_modules/sass-embedded-android-arm": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-android-arm/-/sass-embedded-android-arm-1.83.0.tgz",
"integrity": "sha512-uwFSXzJlfbd4Px189xE5l+cxN8+TQpXdQgJec7TIrb4HEY7imabtpYufpVdqUVwT1/uiis5V4+qIEC4Vl5XObQ==",
"cpu": [
"arm"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-android-arm64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-android-arm64/-/sass-embedded-android-arm64-1.83.0.tgz",
"integrity": "sha512-GBiCvM4a2rkWBLdYDxI6XYnprfk5U5c81g69RC2X6kqPuzxzx8qTArQ9M6keFK4+iDQ5N9QTwFCr0KbZTn+ZNQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-android-ia32": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-android-ia32/-/sass-embedded-android-ia32-1.83.0.tgz",
"integrity": "sha512-5ATPdGo2SICqAhiJl/Z8KQ23zH4sGgobGgux0TnrNtt83uHZ+r+To/ubVJ7xTkZxed+KJZnIpolGD8dQyQqoTg==",
"cpu": [
"ia32"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-android-riscv64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-android-riscv64/-/sass-embedded-android-riscv64-1.83.0.tgz",
"integrity": "sha512-aveknUOB8GZewOzVn2Uwk+DKcncTR50Q6vtzslNMGbYnxtgQNHzy8A1qVEviNUruex+pHofppeMK4iMPFAbiEQ==",
"cpu": [
"riscv64"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-android-x64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-android-x64/-/sass-embedded-android-x64-1.83.0.tgz",
"integrity": "sha512-WqIay/72ncyf9Ph4vS742J3a73wZihWmzFUwpn1OD6lme1Aj4eWzWIve5IVnlTEJgcZcDHu6ECID9IZgehJKoA==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-darwin-arm64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-darwin-arm64/-/sass-embedded-darwin-arm64-1.83.0.tgz",
"integrity": "sha512-XQl9QqgxFFIPm/CzHhmppse5o9ocxrbaAdC2/DAnlAqvYWBBtgFqPjGoYlej13h9SzfvNoogx+y9r+Ap+e+hYg==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-darwin-x64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-darwin-x64/-/sass-embedded-darwin-x64-1.83.0.tgz",
"integrity": "sha512-ERQ7Tvp1kFOW3ux4VDFIxb7tkYXHYc+zJpcrbs0hzcIO5ilIRU2tIOK1OrNwrFO6Qxyf7AUuBwYKLAtIU/Nz7g==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-arm": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-arm/-/sass-embedded-linux-arm-1.83.0.tgz",
"integrity": "sha512-baG9RYBJxUFmqwDNC9h9ZFElgJoyO3jgHGjzEZ1wHhIS9anpG+zZQvO8bHx3dBpKEImX+DBeLX+CxsFR9n81gQ==",
"cpu": [
"arm"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-arm64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-arm64/-/sass-embedded-linux-arm64-1.83.0.tgz",
"integrity": "sha512-syEAVTJt4qhaMLxrSwOWa46zdqHJdnqJkLUK+t9aCr8xqBZLPxSUeIGji76uOehQZ1C+KGFj6n9xstHN6wzOJw==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-ia32": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-ia32/-/sass-embedded-linux-ia32-1.83.0.tgz",
"integrity": "sha512-RRBxQxMpoxu5+XcSSc6QR/o9asEwUzR8AbCS83RaXcdTIHTa/CccQsiAoDDoPlRsMTLqnzs0LKL4CfOsf7zBbA==",
"cpu": [
"ia32"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-musl-arm": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm/-/sass-embedded-linux-musl-arm-1.83.0.tgz",
"integrity": "sha512-Yc7u2TelCfBab+PRob9/MNJFh3EooMiz4urvhejXkihTiKSHGCv5YqDdtWzvyb9tY2Jb7YtYREVuHwfdVn3dTQ==",
"cpu": [
"arm"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-musl-arm64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm64/-/sass-embedded-linux-musl-arm64-1.83.0.tgz",
"integrity": "sha512-Y7juhPHClUO2H5O+u+StRy6SEAcwZ+hTEk5WJdEmo1Bb1gDtfHvJaWB/iFZJ2tW0W1e865AZeUrC4OcOFjyAQA==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-musl-ia32": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-ia32/-/sass-embedded-linux-musl-ia32-1.83.0.tgz",
"integrity": "sha512-arQeYwGmwXV8byx5G1PtSzZWW1jbkfR5qrIHMEbTFSAvAxpqjgSvCvrHMOFd73FcMxVaYh4BX9LQNbKinkbEdg==",
"cpu": [
"ia32"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-musl-riscv64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-riscv64/-/sass-embedded-linux-musl-riscv64-1.83.0.tgz",
"integrity": "sha512-E6uzlIWz59rut+Z3XR6mLG915zNzv07ISvj3GUNZENdHM7dF8GQ//ANoIpl5PljMQKp89GnYdvo6kj2gnaBf/g==",
"cpu": [
"riscv64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-musl-x64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-x64/-/sass-embedded-linux-musl-x64-1.83.0.tgz",
"integrity": "sha512-eAMK6tyGqvqr21r9g8BnR3fQc1rYFj85RGduSQ3xkITZ6jOAnOhuU94N5fwRS852Hpws0lXhET+7JHXgg3U18w==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-riscv64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-riscv64/-/sass-embedded-linux-riscv64-1.83.0.tgz",
"integrity": "sha512-Ojpi78pTv02sy2fUYirRGXHLY3fPnV/bvwuC2i5LwPQw2LpCcFyFTtN0c5h4LJDk9P6wr+/ZB/JXU8tHIOlK+Q==",
"cpu": [
"riscv64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-linux-x64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-linux-x64/-/sass-embedded-linux-x64-1.83.0.tgz",
"integrity": "sha512-3iLjlXdoPfgZRtX4odhRvka1BQs5mAXqfCtDIQBgh/o0JnGPzJIWWl9bYLpHxK8qb+uyVBxXYgXpI0sCzArBOw==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-win32-arm64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-win32-arm64/-/sass-embedded-win32-arm64-1.83.0.tgz",
"integrity": "sha512-iOHw/8/t2dlTW3lOFwG5eUbiwhEyGWawivlKWJ8lkXH7fjMpVx2VO9zCFAm8RvY9xOHJ9sf1L7g5bx3EnNP9BQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-win32-ia32": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-win32-ia32/-/sass-embedded-win32-ia32-1.83.0.tgz",
"integrity": "sha512-2PxNXJ8Pad4geVcTXY4rkyTr5AwbF8nfrCTDv0ulbTvPhzX2mMKEGcBZUXWn5BeHZTBc6whNMfS7d5fQXR9dDQ==",
"cpu": [
"ia32"
],
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sass-embedded-win32-x64": {
"version": "1.83.0",
"resolved": "https://registry.npmjs.org/sass-embedded-win32-x64/-/sass-embedded-win32-x64-1.83.0.tgz",
"integrity": "sha512-muBXkFngM6eLTNqOV0FQi7Dv9s+YRQ42Yem26mosdan/GmJQc81deto6uDTgrYn+bzFNmiXcOdfm+0MkTWK3OQ==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/sucrase": { "node_modules/sucrase": {
"version": "3.35.0", "version": "3.35.0",
"resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz", "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz",
@ -2640,6 +3107,45 @@
"node": ">=8" "node": ">=8"
} }
}, },
"node_modules/supports-color": {
"version": "8.1.1",
"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz",
"integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==",
"devOptional": true,
"license": "MIT",
"dependencies": {
"has-flag": "^4.0.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/chalk/supports-color?sponsor=1"
}
},
"node_modules/sync-child-process": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/sync-child-process/-/sync-child-process-1.0.2.tgz",
"integrity": "sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==",
"devOptional": true,
"license": "MIT",
"dependencies": {
"sync-message-port": "^1.0.0"
},
"engines": {
"node": ">=16.0.0"
}
},
"node_modules/sync-message-port": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/sync-message-port/-/sync-message-port-1.1.3.tgz",
"integrity": "sha512-GTt8rSKje5FilG+wEdfCkOcLL7LWqpMlr2c3LRuKt/YXxcJ52aGSbGBAdI4L3aaqfrBt6y711El53ItyH1NWzg==",
"devOptional": true,
"license": "MIT",
"engines": {
"node": ">=16.0.0"
}
},
"node_modules/tailwindcss": { "node_modules/tailwindcss": {
"version": "3.4.15", "version": "3.4.15",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz", "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz",
@ -2677,12 +3183,32 @@
"node": ">=14.0.0" "node": ">=14.0.0"
} }
}, },
"node_modules/textlinestream": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",
"integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
"license": "MIT"
},
"node_modules/tslib": {
"version": "2.8.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
"devOptional": true,
"license": "0BSD"
},
"node_modules/uc.micro": { "node_modules/uc.micro": {
"version": "2.1.0", "version": "2.1.0",
"resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz", "resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",
"integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==", "integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/varint": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz",
"integrity": "sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==",
"devOptional": true,
"license": "MIT"
},
"node_modules/vite": { "node_modules/vite": {
"version": "5.4.11", "version": "5.4.11",
"resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz", "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz",

View File

@ -6,17 +6,24 @@
"scripts": { "scripts": {
"dev": "vite", "dev": "vite",
"build": "vite build", "build": "vite build",
"preview": "vite preview" "preview": "vite preview",
"analyze": "ANALYZE=1 npx vite-bundle-visualizer"
}, },
"devDependencies": { "devDependencies": {
"sass-embedded": "^1.83.0",
"vite": "^5.4.10" "vite": "^5.4.10"
}, },
"dependencies": { "dependencies": {
"@sec-ant/readable-stream": "^0.6.0",
"@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20", "autoprefixer": "^10.4.20",
"daisyui": "^4.12.14", "daisyui": "^4.12.14",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"markdown-it": "^14.1.0", "markdown-it": "^14.1.0",
"postcss": "^8.4.49", "postcss": "^8.4.49",
"tailwindcss": "^3.4.15", "tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3", "vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13" "vue": "^3.5.13"
} }

View File

@ -0,0 +1,33 @@
{
"demo": true,
"id": "conv-1734086746930",
"lastModified": 1734087548943,
"messages": [
{
"id": 1734086764521,
"role": "user",
"content": "this is a demo conversation, used in dev mode"
},
{
"id": 1734087548327,
"role": "assistant",
"content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
"timings": {
"prompt_n": 1,
"prompt_ms": 28.923,
"predicted_n": 25,
"predicted_ms": 573.016
}
},
{
"id": 1734087548328,
"role": "user",
"content": "this is a demo conversation, used in dev mode"
},
{
"id": 1734087548329,
"role": "assistant",
"content": "Code block:\n```js\nconsole.log('hello world')\n```\n```sh\nls -la /dev\n```"
}
]
}

View File

@ -1,225 +0,0 @@
const paramDefaults = {
stream: true,
temperature: 0.2,
};
let generation_settings = null;
export class CompletionError extends Error {
constructor(message, name, data) {
super(message);
this.name = name;
}
};
// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;
const api_url = config.api_url?.replace(/\/+$/, '') || "";
if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params, prompt };
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
method: 'POST',
body: JSON.stringify(completionParams),
headers: {
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Accept': 'text/event-stream',
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
},
signal: controller.signal,
});
const status = response.status;
if (status !== 200) {
try {
const body = await response.json();
if (body && body.error && body.error.message) {
throw new CompletionError(body.error.message, 'ServerError');
}
} catch (err) {
throw new CompletionError(err.message, 'ServerError');
}
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
let content = "";
let leftover = ""; // Buffer for partially read lines
try {
let cont = true;
while (cont) {
const result = await reader.read();
if (result.done) {
break;
}
// Add any leftover data to the current chunk of data
const text = leftover + decoder.decode(result.value);
// Check if the last character is a line break
const endsWithLineBreak = text.endsWith('\n');
// Split the text into lines
let lines = text.split('\n');
// If the text doesn't end with a line break, then the last line is incomplete
// Store it in leftover to be added to the next chunk of data
if (!endsWithLineBreak) {
leftover = lines.pop();
} else {
leftover = ""; // Reset leftover if we have a line break at the end
}
// Parse all sse events and add them to result
const regex = /^(\S+):\s(.*)$/gm;
for (const line of lines) {
const match = regex.exec(line);
if (match) {
result[match[1]] = match[2];
if (result.data === '[DONE]') {
cont = false;
break;
}
// since we know this is llama.cpp, let's just decode the json in data
if (result.data) {
result.data = JSON.parse(result.data);
content += result.data.content;
// yield
yield result;
// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
cont = false;
break;
}
}
if (result.error) {
try {
result.error = JSON.parse(result.error);
if (result.error.message.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
}
} catch(e) {
console.error(`llama.cpp error ${result.error}`)
}
}
}
}
}
} catch (e) {
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
controller.abort();
}
return content;
}
// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}
// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};
/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}
// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async (config = {}) => {
if (!generation_settings) {
const api_url = config.api_url?.replace(/\/+$/, '') || "";
const props = await fetch(`${api_url}/props`).then(r => r.json());
generation_settings = props.default_generation_settings;
}
return generation_settings;
}

View File

@ -0,0 +1,60 @@
import hljs from 'highlight.js/lib/core';
// only import commonly used languages to reduce bundle size
import python from 'highlight.js/lib/languages/python';
import javascript from 'highlight.js/lib/languages/javascript';
import json from 'highlight.js/lib/languages/json';
import bash from 'highlight.js/lib/languages/bash';
import yaml from 'highlight.js/lib/languages/yaml';
import markdown from 'highlight.js/lib/languages/markdown';
import scss from 'highlight.js/lib/languages/scss';
import xml from 'highlight.js/lib/languages/xml';
import ruby from 'highlight.js/lib/languages/ruby';
import go from 'highlight.js/lib/languages/go';
import java from 'highlight.js/lib/languages/java';
import rust from 'highlight.js/lib/languages/rust';
import scala from 'highlight.js/lib/languages/scala';
import cpp from 'highlight.js/lib/languages/cpp';
import csharp from 'highlight.js/lib/languages/csharp';
import swift from 'highlight.js/lib/languages/swift';
import dart from 'highlight.js/lib/languages/dart';
import elixir from 'highlight.js/lib/languages/elixir';
import kotlin from 'highlight.js/lib/languages/kotlin';
import lua from 'highlight.js/lib/languages/lua';
import php from 'highlight.js/lib/languages/php';
import latex from 'highlight.js/lib/languages/latex';
hljs.registerLanguage('python', python);
hljs.registerLanguage('javascript', javascript);
hljs.registerLanguage('json', json);
hljs.registerLanguage('yaml', yaml);
hljs.registerLanguage('markdown', markdown);
hljs.registerLanguage('xml', xml);
hljs.registerLanguage('ruby', ruby);
hljs.registerLanguage('go', go);
hljs.registerLanguage('java', java);
hljs.registerLanguage('rust', rust);
hljs.registerLanguage('scala', scala);
hljs.registerLanguage('csharp', csharp);
hljs.registerLanguage('swift', swift);
hljs.registerLanguage('dart', dart);
hljs.registerLanguage('elixir', elixir);
hljs.registerLanguage('kotlin', kotlin);
hljs.registerLanguage('lua', lua);
hljs.registerLanguage('php', php);
hljs.registerLanguage('latex', latex);
// reuse some languages to further reduce bundle size
hljs.registerLanguage('shell', bash);
hljs.registerLanguage('bash', bash);
hljs.registerLanguage('sh', bash);
hljs.registerLanguage('css', scss);
hljs.registerLanguage('scss', scss);
hljs.registerLanguage('c', cpp);
hljs.registerLanguage('cpp', cpp);
export default hljs;

View File

@ -0,0 +1,66 @@
import katex from 'katex';
// Adapted from https://github.com/SchneeHertz/markdown-it-katex-gpt
// MIT license
const defaultOptions = {
delimiters: [
{ left: '\\[', right: '\\]', display: true },
{ left: '\\(', right: '\\)', display: false },
],
};
export function renderLatexHTML(content, display = false) {
return katex.renderToString(content, {
throwOnError: false,
output: 'mathml',
displayMode: display,
});
}
function escapedBracketRule(options) {
return (state, silent) => {
const max = state.posMax;
const start = state.pos;
for (const { left, right, display } of options.delimiters) {
// Check if it starts with the left delimiter
if (!state.src.slice(start).startsWith(left)) continue;
// Skip the length of the left delimiter
let pos = start + left.length;
// Find the matching right delimiter
while (pos < max) {
if (state.src.slice(pos).startsWith(right)) {
break;
}
pos++;
}
// No matching right delimiter found, skip to the next match
if (pos >= max) continue;
// If not in silent mode, convert LaTeX formula to MathML
if (!silent) {
const content = state.src.slice(start + left.length, pos);
try {
const renderedContent = renderLatexHTML(content, display);
const token = state.push('html_inline', '', 0);
token.content = renderedContent;
} catch (e) {
console.error(e);
}
}
// Update position, skip the length of the right delimiter
state.pos = pos + right.length;
return true;
}
}
}
export default function (md, options = defaultOptions) {
md.inline.ruler.after('text', 'escaped_bracket', escapedBracketRule(options));
}

View File

@ -1,23 +1,42 @@
import './styles.css'; import './styles.scss';
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js'; import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
import { llama } from './completion.js';
import MarkdownIt from 'markdown-it'; import MarkdownIt from 'markdown-it';
import TextLineStream from 'textlinestream';
// math formula rendering
import 'katex/dist/katex.min.css';
import markdownItKatexGpt from './katex-gpt';
import markdownItKatexNormal from '@vscode/markdown-it-katex';
// code highlighting
import hljs from './highlight-config';
import daisyuiThemes from 'daisyui/src/theming/themes';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from "@sec-ant/readable-stream/ponyfill/asyncIterator";
const isDev = import.meta.env.MODE === 'development';
// utility functions // utility functions
const isString = (x) => !!x.toLowerCase; const isString = (x) => !!x.toLowerCase;
const isNumeric = (n) => !isString(n) && !isNaN(n); const isBoolean = (x) => x === true || x === false;
const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;'); const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (str) => navigator.clipboard.writeText(str); const copyStr = (str) => navigator.clipboard.writeText(str);
// constants // constants
const BASE_URL = localStorage.getItem('base') // for debugging const BASE_URL = isDev
|| (new URL('.', document.baseURI).href).toString(); // for production ? (localStorage.getItem('base') || 'https://localhost:8080') // for debugging
: (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
console.log({ BASE_URL });
const CONFIG_DEFAULT = { const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value. // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '', apiKey: '',
systemMessage: 'You are a helpful assistant.', systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
// make sure these default values are in sync with `common.h` // make sure these default values are in sync with `common.h`
samplers: 'dkypmxt', samplers: 'edkypmxt',
temperature: 0.8, temperature: 0.8,
dynatemp_range: 0.0, dynatemp_range: 0.0,
dynatemp_exponent: 1.0, dynatemp_exponent: 1.0,
@ -65,12 +84,39 @@ const CONFIG_INFO = {
// config keys having numeric value (i.e. temperature, top_k, top_p, etc) // config keys having numeric value (i.e. temperature, top_k, top_p, etc)
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]); const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
// list of themes supported by daisyui // list of themes supported by daisyui
const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset']; const THEMES = ['light', 'dark']
// make sure light & dark are always at the beginning
.concat(Object.keys(daisyuiThemes).filter(t => t !== 'light' && t !== 'dark'));
// markdown support // markdown support
const VueMarkdown = defineComponent( const VueMarkdown = defineComponent(
(props) => { (props) => {
const md = shallowRef(new MarkdownIt({ breaks: true })); const md = shallowRef(new MarkdownIt({
breaks: true,
highlight: function (str, lang) { // Add highlight.js
if (lang && hljs.getLanguage(lang)) {
try {
return '<pre><code class="hljs">' +
hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
'</code></pre>';
} catch (__) {}
}
return '<pre><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
}
}));
// support latex with double dollar sign and square brackets
md.value.use(markdownItKatexGpt, {
delimiters: [
{ left: '\\[', right: '\\]', display: true },
{ left: '\\(', right: '\\)', display: false },
{ left: '$$', right: '$$', display: false },
// do not add single dollar sign here, other wise it will confused with dollar used for money symbol
],
throwOnError: false,
});
// support latex with single dollar sign
md.value.use(markdownItKatexNormal, { throwOnError: false });
// add copy button to code blocks
const origFenchRenderer = md.value.renderer.rules.fence; const origFenchRenderer = md.value.renderer.rules.fence;
md.value.renderer.rules.fence = (tokens, idx, ...args) => { md.value.renderer.rules.fence = (tokens, idx, ...args) => {
const content = tokens[idx].content; const content = tokens[idx].content;
@ -101,6 +147,48 @@ const SettingsModalShortInput = defineComponent({
}, },
}); });
// message bubble component
const MessageBubble = defineComponent({
components: {
VueMarkdown
},
template: document.getElementById('message-bubble').innerHTML,
props: {
config: Object,
msg: Object,
isGenerating: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
data() {
return {
editingContent: null,
};
},
computed: {
timings() {
if (!this.msg.timings) return null;
return {
...this.msg.timings,
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
}
},
methods: {
copyMsg() {
copyStr(this.msg.content);
},
editMsg() {
this.editUserMsgAndRegenerate({
...this.msg,
content: this.editingContent,
});
this.editingContent = null;
},
},
});
// coversations is stored in localStorage // coversations is stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } } // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-' // convId is a string prefixed with 'conv-'
@ -192,10 +280,29 @@ const chatScrollToBottom = (requiresNearBottom) => {
} }
}; };
// wrapper for SSE
async function* sendSSEPostRequest(url, fetchOptions) {
const res = await fetch(url, fetchOptions);
const lines = res.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
for await (const line of asyncIterator(lines)) {
if (isDev) console.log({line});
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
};
const mainApp = createApp({ const mainApp = createApp({
components: { components: {
VueMarkdown, VueMarkdown,
SettingsModalShortInput, SettingsModalShortInput,
MessageBubble,
}, },
data() { data() {
return { return {
@ -209,11 +316,11 @@ const mainApp = createApp({
selectedTheme: StorageUtils.getTheme(), selectedTheme: StorageUtils.getTheme(),
config: StorageUtils.getConfig(), config: StorageUtils.getConfig(),
showConfigDialog: false, showConfigDialog: false,
editingMsg: null,
// const // const
themes: THEMES, themes: THEMES,
configDefault: {...CONFIG_DEFAULT}, configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO}, configInfo: {...CONFIG_INFO},
isDev,
} }
}, },
computed: {}, computed: {},
@ -225,6 +332,16 @@ const mainApp = createApp({
if (this.isGenerating) chatScrollToBottom(true); if (this.isGenerating) chatScrollToBottom(true);
}); });
resizeObserver.observe(pendingMsgElem); resizeObserver.observe(pendingMsgElem);
this.setSelectedTheme(this.selectedTheme);
},
watch: {
viewingConvId: function(val, oldVal) {
if (val != oldVal) {
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}
}
}, },
methods: { methods: {
hideSidebar() { hideSidebar() {
@ -232,23 +349,17 @@ const mainApp = createApp({
}, },
setSelectedTheme(theme) { setSelectedTheme(theme) {
this.selectedTheme = theme; this.selectedTheme = theme;
document.body.setAttribute('data-theme', theme);
document.body.setAttribute('data-color-scheme', daisyuiThemes[theme]?.['color-scheme'] ?? 'auto');
StorageUtils.setTheme(theme); StorageUtils.setTheme(theme);
}, },
newConversation() { newConversation() {
if (this.isGenerating) return; if (this.isGenerating) return;
this.viewingConvId = StorageUtils.getNewConvId(); this.viewingConvId = StorageUtils.getNewConvId();
this.editingMsg = null;
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}, },
setViewingConv(convId) { setViewingConv(convId) {
if (this.isGenerating) return; if (this.isGenerating) return;
this.viewingConvId = convId; this.viewingConvId = convId;
this.editingMsg = null;
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}, },
deleteConv(convId) { deleteConv(convId) {
if (this.isGenerating) return; if (this.isGenerating) return;
@ -256,7 +367,6 @@ const mainApp = createApp({
StorageUtils.remove(convId); StorageUtils.remove(convId);
if (this.viewingConvId === convId) { if (this.viewingConvId === convId) {
this.viewingConvId = StorageUtils.getNewConvId(); this.viewingConvId = StorageUtils.getNewConvId();
this.editingMsg = null;
} }
this.fetchConversation(); this.fetchConversation();
this.fetchMessages(); this.fetchMessages();
@ -291,7 +401,6 @@ const mainApp = createApp({
this.fetchConversation(); this.fetchConversation();
this.fetchMessages(); this.fetchMessages();
this.inputMsg = ''; this.inputMsg = '';
this.editingMsg = null;
this.generateMessage(currConvId); this.generateMessage(currConvId);
chatScrollToBottom(); chatScrollToBottom();
}, },
@ -299,7 +408,6 @@ const mainApp = createApp({
if (this.isGenerating) return; if (this.isGenerating) return;
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null }; this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
this.isGenerating = true; this.isGenerating = true;
this.editingMsg = null;
try { try {
const abortController = new AbortController(); const abortController = new AbortController();
@ -330,17 +438,21 @@ const mainApp = createApp({
dry_allowed_length: this.config.dry_allowed_length, dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n, dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens, max_tokens: this.config.max_tokens,
timings_per_token: !!this.config.showTokensPerSecond,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}), ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
}; };
const config = { const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
controller: abortController, method: 'POST',
api_url: BASE_URL, headers: {
endpoint: '/chat/completions', 'Content-Type': 'application/json',
}; ...(this.config.apiKey ? {'Authorization': `Bearer ${this.config.apiKey}`} : {})
for await (const chunk of llama(prompt, params, config)) { },
const stop = chunk.data.stop; body: JSON.stringify(params),
const addedContent = chunk.data.choices[0].delta.content; signal: abortController.signal,
});
for await (const chunk of chunks) {
const stop = chunk.stop;
const addedContent = chunk.choices[0].delta.content;
const lastContent = this.pendingMsg.content || ''; const lastContent = this.pendingMsg.content || '';
if (addedContent) { if (addedContent) {
this.pendingMsg = { this.pendingMsg = {
@ -349,6 +461,16 @@ const mainApp = createApp({
content: lastContent + addedContent, content: lastContent + addedContent,
}; };
} }
const timings = chunk.timings;
if (timings && this.config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
prompt_ms: timings.prompt_ms,
predicted_n: timings.predicted_n,
predicted_ms: timings.predicted_ms,
};
}
} }
StorageUtils.appendMsg(currConvId, this.pendingMsg); StorageUtils.appendMsg(currConvId, this.pendingMsg);
@ -387,14 +509,10 @@ const mainApp = createApp({
this.fetchMessages(); this.fetchMessages();
this.generateMessage(currConvId); this.generateMessage(currConvId);
}, },
copyMsg(msg) {
copyStr(msg.content);
},
editUserMsgAndRegenerate(msg) { editUserMsgAndRegenerate(msg) {
if (this.isGenerating) return; if (this.isGenerating) return;
const currConvId = this.viewingConvId; const currConvId = this.viewingConvId;
const newContent = msg.content; const newContent = msg.content;
this.editingMsg = null;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id); StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
StorageUtils.appendMsg(currConvId, { StorageUtils.appendMsg(currConvId, {
id: Date.now(), id: Date.now(),
@ -441,6 +559,17 @@ const mainApp = createApp({
fetchMessages() { fetchMessages() {
this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? []; this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
}, },
// debug functions
async debugImportDemoConv() {
const res = await fetch('/demo-conversation.json');
const demoConv = await res.json();
StorageUtils.remove(demoConv.id);
for (const msg of demoConv.messages) {
StorageUtils.appendMsg(demoConv.id, msg);
}
this.fetchConversation();
}
}, },
}); });
mainApp.config.errorHandler = alert; mainApp.config.errorHandler = alert;

View File

@ -1,26 +0,0 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
.markdown {
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
pre {
@apply whitespace-pre-wrap rounded-lg p-2;
border: 1px solid currentColor;
}
/* TODO: fix markdown table */
}
.show-on-hover {
@apply md:opacity-0 md:group-hover:opacity-100;
}
.btn-mini {
@apply cursor-pointer hover:shadow-md;
}
.chat-screen { max-width: 900px; }
.chat-bubble-base-300 {
--tw-bg-opacity: 1;
--tw-text-opacity: 1;
@apply bg-base-300 text-base-content;
}

View File

@ -0,0 +1,48 @@
@use "sass:meta";
@tailwind base;
@tailwind components;
@tailwind utilities;
.markdown {
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
pre {
@apply whitespace-pre-wrap rounded-lg p-2;
border: 1px solid currentColor;
}
/* TODO: fix markdown table */
}
.show-on-hover {
@apply md:opacity-0 md:group-hover:opacity-100;
}
.btn-mini {
@apply cursor-pointer hover:shadow-md;
}
.chat-screen { max-width: 900px; }
.chat-bubble-base-300 {
--tw-bg-opacity: 1;
--tw-text-opacity: 1;
@apply bg-base-300 text-base-content;
}
/* Highlight.js */
[data-color-scheme='light'] {
@include meta.load-css('highlight.js/styles/stackoverflow-light');
}
[data-color-scheme='dark'] {
@include meta.load-css('highlight.js/styles/stackoverflow-dark');
}
[data-color-scheme='auto'] {
@media (prefers-color-scheme: light) {
@include meta.load-css('highlight.js/styles/stackoverflow-light');
}
@media (prefers-color-scheme: dark) {
@include meta.load-css('highlight.js/styles/stackoverflow-dark');
}
}
.hljs {
background: transparent !important;
padding: 0.5em !important;
}

View File

@ -2,6 +2,9 @@
import { viteSingleFile } from 'vite-plugin-singlefile'; import { viteSingleFile } from 'vite-plugin-singlefile';
import path from 'path'; import path from 'path';
import fs from 'fs'; import fs from 'fs';
import zlib from 'zlib';
const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
const GUIDE_FOR_FRONTEND = ` const GUIDE_FOR_FRONTEND = `
<!-- <!--
@ -12,8 +15,7 @@ const GUIDE_FOR_FRONTEND = `
--> -->
`.trim(); `.trim();
export default { const BUILD_PLUGINS = [
plugins: [
viteSingleFile(), viteSingleFile(),
(function llamaCppPlugin() { (function llamaCppPlugin() {
let config; let config;
@ -25,12 +27,33 @@ export default {
}, },
writeBundle() { writeBundle() {
const outputIndexHtml = path.join(config.build.outDir, 'index.html'); const outputIndexHtml = path.join(config.build.outDir, 'index.html');
const content = fs.readFileSync(outputIndexHtml, 'utf-8'); const content = GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
const targetOutputFile = path.join(config.build.outDir, '../../public/index.html'); // because gzip header contains machine-specific info, we must remove these data from the header
fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content); // timestamp
compressed[0x4] = 0;
compressed[0x5] = 0;
compressed[0x6] = 0;
compressed[0x7] = 0;
// OS
compressed[0x9] = 0;
if (compressed.byteLength > MAX_BUNDLE_SIZE) {
throw new Error(
`Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
`Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`,
);
}
const targetOutputFile = path.join(config.build.outDir, '../../public/index.html.gz');
fs.writeFileSync(targetOutputFile, compressed);
} }
} }
})(), })(),
], ];
/** @type {import('vite').UserConfig} */
export default {
plugins: process.env.ANALYZE ? [] : BUILD_PLUGINS,
}; };

View File

@ -394,7 +394,7 @@ int main(int raw_argc, char ** raw_argv) {
} }
if (show_token_count) { if (show_token_count) {
printf("Total number of tokens: %ld\n", tokens.size()); printf("Total number of tokens: %zu\n", tokens.size());
} }
// silence valgrind // silence valgrind
llama_free(ctx); llama_free(ctx);

View File

@ -0,0 +1,5 @@
set(TARGET llama-tts)
add_executable(${TARGET} tts.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -0,0 +1,180 @@
# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder
#
# TODO: this script is LLM-generated and probably very inefficient and should be rewritten
import torch
import json
import os
import sys
import re
from safetensors.torch import save_file
# default
model_path = './model.pt';
# read from CLI
if len(sys.argv) > 1:
model_path = sys.argv[1]
# get the directory of the input model
path_dst = os.path.dirname(model_path)
print(f"Loading model from {model_path}")
model = torch.load(model_path, map_location='cpu')
#print(model)
# print all keys
for key in model.keys():
print(key)
if key == 'hyper_parameters':
#print(model[key])
# dump as json pretty
print(json.dumps(model[key], indent=4))
#if key != 'state_dict' and key != 'optimizer_states':
# print(model[key])
# Check if the loaded model is a state_dict or a model instance
if isinstance(model, torch.nn.Module):
state_dict = model.state_dict()
else:
state_dict = model
# Print the structure of the state_dict to understand its format
print("State dictionary keys:")
for key in state_dict.keys():
print(key)
# Ensure the state_dict is flat and contains only torch.Tensor objects
def flatten_state_dict(state_dict, parent_key='', sep='.'):
items = []
items_new = []
for k, v in state_dict.items():
new_key = f"{parent_key}{sep}{k}" if parent_key else k
if isinstance(v, torch.Tensor):
items.append((new_key, v))
elif isinstance(v, dict):
items.extend(flatten_state_dict(v, new_key, sep=sep).items())
return dict(items)
size_total_mb = 0
for key, value in list(items):
# keep only what we need for inference
if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \
not key.startswith('state_dict.backbone.') and \
not key.startswith('state_dict.head.out'):
print('Skipping key: ', key)
continue
new_key = key
new_key = new_key.replace('state_dict.', '')
new_key = new_key.replace('pos_net', 'posnet')
# check if matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight"
if new_key.startswith("backbone.posnet."):
match = re.match(r"backbone\.posnet\.(\d+)\.(bias|weight)", new_key)
if match:
new_key = f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}"
# "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight"
if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":
new_key = "backbone.embedding.weight"
# these are the only rows used
# ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100
if new_key.endswith("norm.scale.weight"):
new_key = new_key.replace("norm.scale.weight", "norm.weight")
value = value[0]
if new_key.endswith("norm.shift.weight"):
new_key = new_key.replace("norm.shift.weight", "norm.bias")
value = value[0]
if new_key.endswith("gamma"):
new_key = new_key.replace("gamma", "gamma.weight")
# convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias
if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.posnet") or new_key.startswith("backbone.embed.bias")):
value = value.unsqueeze(1)
if new_key.endswith("dwconv.bias"):
value = value.unsqueeze(1)
size_mb = value.element_size() * value.nelement() / (1024 * 1024)
print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")
size_total_mb += size_mb
#print(key, '->', new_key, ': ', value)
#print(key, '->', new_key)
items_new.append((new_key, value))
print(f"Total size: {size_total_mb:8.2f} MB")
return dict(items_new)
flattened_state_dict = flatten_state_dict(state_dict)
# Convert the model to the safetensors format
output_path = path_dst + '/model.safetensors'
save_file(flattened_state_dict, output_path)
print(f"Model has been successfully converted and saved to {output_path}")
# Calculate the total size of the .safetensors file
total_size = os.path.getsize(output_path)
# Create the weight map
weight_map = {
"model.safetensors": ["*"] # Assuming all weights are in one file
}
# Create metadata for the index.json file
metadata = {
"total_size": total_size,
"weight_map": weight_map
}
# Save the metadata to index.json
index_path = path_dst + '/index.json'
with open(index_path, 'w') as f:
json.dump(metadata, f, indent=4)
print(f"Metadata has been saved to {index_path}")
config = {
"architectures": [
"WavTokenizerDec"
],
"hidden_size": 1282,
"n_embd_features": 512,
"n_ff": 2304,
"vocab_size": 4096,
"n_head": 1,
"layer_norm_epsilon": 1e-6,
"group_norm_epsilon": 1e-6,
"group_norm_groups": 32,
"max_position_embeddings": 8192, # ?
"n_layer": 12,
"posnet": {
"n_embd": 768,
"n_layer": 6
},
"convnext": {
"n_embd": 768,
"n_layer": 12
},
}
with open(path_dst + '/config.json', 'w') as f:
json.dump(config, f, indent=4)
print(f"Config has been saved to {path_dst + 'config.json'}")

175
examples/tts/tts-outetts.py Normal file
View File

@ -0,0 +1,175 @@
import sys
#import json
#import struct
import requests
import re
def process_text(text: str):
text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed
text = re.sub(r'[-_/,\.\\]', ' ', text)
text = re.sub(r'[^a-z\s]', '', text)
text = re.sub(r'\s+', ' ', text).strip()
return text.split()
# usage:
# python tts-outetts.py http://server-llm:port http://server-dec:port "text"
if len(sys.argv) <= 3:
print("usage: python tts-outetts.py http://server-llm:port http://server-dec:port \"text\"")
exit(1)
host_llm = sys.argv[1]
host_dec = sys.argv[2]
text = sys.argv[3]
prefix = """<|im_start|>
<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>"""
words = process_text(text)
words = "<|text_sep|>".join([i.strip() for i in words])
words += "<|text_end|>\n"
# voice data
# TODO: load from json
#suffix = """<|audio_start|>
#the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
#overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
#package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
#from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|>
#just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|>
#two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|>
#people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|>
#is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|>
#pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|>
#remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|>
#sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|>
#i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|>
#have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|>
#some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|>
#critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|>
#about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|>
#some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|>
#of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|>
#the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|>
#gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|>
#aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|>
#but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|>
#its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|>
#still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|>
#really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|>
#enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|>
#and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|>
#it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
#looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
#lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>"""
# TODO: tokenization is slow for some reason - here is pre-tokenized input
suffix = [ 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, 152460, 153375, 151670, 198, 74455,
155808, 151669, 151799, 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, 151970, 153413,
152419, 153334, 153289, 153374, 153199, 152040, 153260, 152721, 152680, 153297, 152419, 153248, 152400,
152691, 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, 152256, 152991, 152299, 152688, 153163,
153016, 152789, 153198, 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, 152461, 153321,
153309, 151750, 152137, 153340, 152573, 152267, 153347, 151789, 152681, 153339, 151992, 152512, 151751,
152179, 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, 152311, 151670, 198, 1499, 155791,
151669, 152276, 152454, 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, 153043, 152325,
153267, 152622, 151670, 198, 4250, 155797, 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271,
152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, 152112, 153204, 151722, 152542, 151670, 198,
19789, 155796, 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, 152191, 151734, 152312, 152810,
152237, 153224, 153169, 153224, 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, 152265, 151946,
151808, 152412, 152363, 152305, 153156, 152733, 152810, 153157, 152016, 152100, 152069, 153234, 152317,
152589, 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, 153376, 152272, 152433, 152325,
151941, 151670, 198, 285, 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, 152474, 152680,
152157, 153255, 152324, 151682, 151670, 198, 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682,
152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, 153070, 151883, 152890, 152489, 153144,
153375, 152358, 151685, 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, 151902, 152720,
153377, 152027, 152378, 152821, 153207, 153459, 153028, 153068, 152507, 153255, 152158, 152921, 151958,
152609, 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, 152606, 152162, 152186, 153071,
152244, 153118, 153375, 153018, 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, 153380,
153502, 152702, 152115, 153181, 152735, 153277, 153457, 152393, 153112, 152595, 151670, 198, 19098, 155808,
151669, 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, 153163, 152922, 153402, 152034,
152591, 153438, 152215, 151673, 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, 152718,
152862, 153347, 151670, 198, 72, 155780, 151669, 151795, 152111, 152746, 152377, 153471, 152309, 151670, 198,
19016, 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, 152939, 152536, 152091, 151815, 152733,
151672, 151670, 198, 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, 153504, 152589, 153333,
151839, 151941, 153038, 153180, 151670, 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, 152801,
152985, 153400, 152393, 152818, 152765, 152249, 152600, 151699, 152302, 152752, 153018, 153009, 151992,
153054, 152847, 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, 152048, 152757, 152428,
153195, 151906, 153006, 153178, 153250, 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418,
152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, 153321, 152217, 153039, 152935, 153400, 152122,
152531, 153106, 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, 152901, 152885, 152594,
153446, 153080, 151670, 198, 14689, 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, 151673,
151690, 151698, 152714, 152846, 152981, 153171, 153384, 153364, 153188, 153246, 151670, 198, 1055, 155779,
151669, 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, 155780, 151669, 153483, 153240, 152241,
152558, 152697, 153046, 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, 153034, 153434,
153372, 153347, 151887, 152453, 152758, 152133, 152510, 152694, 152431, 152321, 153088, 152676, 152223,
152581, 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, 152903, 152859, 152989, 151748,
152669, 152661, 152650, 152409, 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, 152988,
152894, 151819, 152391, 153019, 152058, 153062, 153230, 151826, 152112, 152306, 152264, 152769, 153390,
152384, 152435, 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, 151919, 151893, 152558,
152817, 152946, 152956, 152129, 152715, 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450,
151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, 152679, 152533, 152382, 152374, 152611, 153341,
153163, 152285, 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, 151669, 151764, 152360, 153295,
152634, 153342, 152199, 152271, 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, 152016, 152385,
152629, 152495, 151826, 153321, 152958, 152180, 151886, 153432, 152922, 152128, 153024, 153040, 152593,
152287, 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, 152680, 153331, 151699, 152316, 152938,
152289, 152433, 153384, 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, 152489, 151941,
152049, 152034, 153053, 152179, 153160, 151676, 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350,
152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, 153135, 152291, 153235, 152143, 152583,
152402, 153483, 152678, 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, 152548, 153442,
152109, 152659, 153325, 152781, 152570, 152957, 151752, 152265, 153381, 152515, 151670, 198, 437, 155787,
151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, 151792, 153409, 153327, 152990, 151670, 198,
275, 155781, 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, 151670, 198, 94273, 155799,
151669, 152953, 152938, 153427, 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, 152257,
152987, 152777, 153448, 152408, 151696, 152408, 152326, 152699, 151670, 198, 385, 16239, 155828, 151669,
152306, 152268, 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, 152918, 152923, 152467,
152331, 153053, 153330, 151889, 153444, 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751,
152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, 152109, 152255, 151739, 152267, 152759,
153318, 153165, 153349, 151670, ]
response = requests.post(
host_llm + "/completion",
json={
"prompt": [prefix + words, *suffix],
"n_predict": 1024,
"cache_prompt": True,
"return_tokens": True,
"samplers": ["top_k"],
"top_k": 16,
"seed": 1003,
}
)
response_json = response.json()
#print(json.dumps(response_json, indent=4))
#print(json.dumps(response_json["prompt"], indent=4).replace("\\n", "\n"))
#print(json.dumps(response_json["timings"], indent=4))
#print(json.dumps(response_json["tokens"], indent=4))
codes = response_json["tokens"]
codes = [t - 151672 for t in codes if t >= 151672 and t <= 155772]
response = requests.post(
host_dec + "/embeddings",
json={
"input": [*codes],
}
)
response_json = response.json()
#print(json.dumps(response_json, indent=4))
# spectrogram
embd = response_json[0]["embedding"]
n_codes = len(embd)
n_embd = len(embd[0])
print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd))
# post-process the spectrogram to convert to audio
# TODO: see the tts.cpp:embd_to_audio() and implement it in Python
print('converting to audio ...')
print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python')

932
examples/tts/tts.cpp Normal file
View File

@ -0,0 +1,932 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#define _USE_MATH_DEFINES // For M_PI on MSVC
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <fstream>
#include <map>
#include <regex>
#include <string>
#include <thread>
#include <vector>
//
// Terminal utils
//
#define SQR(X) ((X) * (X))
#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
/**
* Quantizes 24-bit RGB to xterm256 code range [16,256).
*/
static int rgb2xterm256(int r, int g, int b) {
unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
int av, ir, ig, ib, il, qr, qg, qb, ql;
av = r * .299 + g * .587 + b * .114 + .5;
ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
qr = cube[(ir = UNCUBE(r))];
qg = cube[(ig = UNCUBE(g))];
qb = cube[(ib = UNCUBE(b))];
if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
return ir * 36 + ig * 6 + ib + 020;
return il + 0350;
}
static std::string set_xterm256_foreground(int r, int g, int b) {
int x = rgb2xterm256(r, g, b);
std::ostringstream oss;
oss << "\033[38;5;" << x << "m";
return oss.str();
}
const std::vector<std::string> k_colors = {
set_xterm256_foreground(220, 5, 12),
set_xterm256_foreground(232, 96, 28),
set_xterm256_foreground(241, 147, 45),
set_xterm256_foreground(246, 193, 65),
set_xterm256_foreground(247, 240, 86),
set_xterm256_foreground(144, 201, 135),
set_xterm256_foreground( 78, 178, 101),
};
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -p \"Hello!\"\n", argv[0]);
LOG("\n");
}
struct wav_header {
char riff[4] = {'R', 'I', 'F', 'F'};
uint32_t chunk_size;
char wave[4] = {'W', 'A', 'V', 'E'};
char fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmt_chunk_size = 16;
uint16_t audio_format = 1; // PCM
uint16_t num_channels = 1; // Mono
uint32_t sample_rate;
uint32_t byte_rate;
uint16_t block_align;
uint16_t bits_per_sample = 16;
char data[4] = {'d', 'a', 't', 'a'};
uint32_t data_size;
};
static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
std::ofstream file(fname, std::ios::binary);
if (!file) {
LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
return;
}
wav_header header;
header.sample_rate = sample_rate;
header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8);
header.block_align = header.num_channels * (header.bits_per_sample / 8);
header.data_size = data.size() * (header.bits_per_sample / 8);
header.chunk_size = 36 + header.data_size;
file.write(reinterpret_cast<const char*>(&header), sizeof(header));
for (const auto & sample : data) {
int16_t pcm_sample = static_cast<int16_t>(std::clamp(sample * 32767.0, -32768.0, 32767.0));
file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
}
file.close();
}
static void fill_hann_window(int length, bool periodic, float * output) {
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
// very poor-man fft
static void twiddle(float * real, float * imag, int k, int N) {
float angle = 2 * M_PI * k / N;
*real = cos(angle);
*imag = sin(angle);
}
static void irfft(int n, const float * inp_cplx, float * out_real) {
int N = n / 2 + 1;
std::vector<float> real_input(N);
std::vector<float> imag_input(N);
for (int i = 0; i < N; ++i) {
real_input[i] = inp_cplx[2 * i];
imag_input[i] = inp_cplx[2 * i + 1];
}
std::vector<float> real_output(n);
std::vector<float> imag_output(n);
for (int k = 0; k < n; ++k) {
real_output[k] = 0.0f;
imag_output[k] = 0.0f;
for (int m = 0; m < N; ++m) {
float twiddle_real;
float twiddle_imag;
twiddle(&twiddle_real, &twiddle_imag, k * m, n);
real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
}
}
for (int i = 0; i < n; ++i) {
out_real[i] = real_output[i] / N;
}
}
//
// y = torch.nn.functional.fold(
// data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
// )[:, 0, 0, pad:-pad]
//
// data.shape = torch.Size([1, 1280, 261])
// output_size = 84480
// win_length = 1280
// hop_length = 320
// pad = 480
//
static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
int64_t output_height = n_out;
int64_t kernel_w = n_win;
int64_t stride_w = n_hop;
int64_t width = n_out;
output.resize(width, 0.0f);
int64_t col_idx = 0;
for (int64_t w_col = 0; w_col < width; ++w_col) {
int64_t start = w_col * stride_w - n_pad;
int64_t end = start + kernel_w;
for (int64_t w_im = start; w_im < end; ++w_im) {
if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
output[w_im] += data[col_idx];
}
col_idx++;
}
}
output.resize(n_out - 2 * n_pad);
}
// TODO: not optimized at all
static std::vector<float> embd_to_audio(
const float * embd,
const int n_codes,
const int n_embd,
const int n_thread) {
const int n_fft = 1280;
const int n_hop = 320;
const int n_win = 1280;
const int n_pad = (n_win - n_hop)/2;
const int n_out = (n_codes - 1)*n_hop + n_win;
std::vector<float> hann(n_fft);
fill_hann_window(hann.size(), true, hann.data());
int n_spec = n_embd*n_codes;
std::vector<float> E (n_spec);
std::vector<float> S (n_spec);
std::vector<float> ST(n_spec);
for (int l = 0; l < n_codes; ++l) {
for (int k = 0; k < n_embd; ++k) {
E[k*n_codes + l] = embd[l*n_embd + k];
}
}
for (int k = 0; k < n_embd/2; ++k) {
for (int l = 0; l < n_codes; ++l) {
float mag = E[(k )*n_codes + l];
float phi = E[(k + n_embd/2)*n_codes + l];
mag = exp(mag);
if (mag > 1e2) {
mag = 1e2;
}
S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
}
}
for (int l = 0; l < n_codes; ++l) {
for (int k = 0; k < n_embd/2; ++k) {
ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
}
}
std::vector<float> res (n_codes*n_fft);
std::vector<float> hann2(n_codes*n_fft);
std::vector<std::thread> workers(n_thread);
for (int i = 0; i < n_thread; ++i) {
workers[i] = std::thread([&, i]() {
for (int l = i; l < n_codes; l += n_thread) {
irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
for (int j = 0; j < n_fft; ++j) {
res [l*n_fft + j] *= hann[j];
hann2[l*n_fft + j] = hann[j] * hann[j];
}
}
});
}
for (int i = 0; i < n_thread; ++i) {
workers[i].join();
}
std::vector<float> audio;
std::vector<float> env;
fold(res, n_out, n_win, n_hop, n_pad, audio);
fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
for (size_t i = 0; i < audio.size(); ++i) {
audio[i] /= env[i];
}
return audio;
}
static const std::map<int, std::string> ones = {
{0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
{5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
{10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
{15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
};
static const std::map<int, std::string> tens = {
{2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
{6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
};
// Convert a number less than 1000 to words
static std::string convert_less_than_thousand(int num) {
std::string result;
if (num >= 100) {
result += ones.at(num / 100) + " hundred ";
num %= 100;
}
if (num >= 20) {
result += tens.at(num / 10);
if (num % 10 > 0) {
result += "-" + ones.at(num % 10);
}
} else if (num > 0) {
result += ones.at(num);
}
return result;
}
static std::string number_to_words(const std::string & number_str) {
try {
size_t decimal_pos = number_str.find('.');
std::string integer_part = number_str.substr(0, decimal_pos);
int int_number = std::stoi(integer_part);
std::string result;
if (int_number == 0) {
result = "zero";
} else {
if (int_number >= 1000000000) {
int billions = int_number / 1000000000;
result += convert_less_than_thousand(billions) + " billion ";
int_number %= 1000000000;
}
if (int_number >= 1000000) {
int millions = int_number / 1000000;
result += convert_less_than_thousand(millions) + " million ";
int_number %= 1000000;
}
if (int_number >= 1000) {
int thousands = int_number / 1000;
result += convert_less_than_thousand(thousands) + " thousand ";
int_number %= 1000;
}
if (int_number > 0) {
result += convert_less_than_thousand(int_number);
}
}
// Handle decimal part
if (decimal_pos != std::string::npos) {
result += " point";
std::string decimal_part = number_str.substr(decimal_pos + 1);
for (char digit : decimal_part) {
result += " " + ones.at(digit - '0');
}
}
return result;
} catch (const std::exception& e) {
// Skip if fails
return " ";
}
}
static std::string replace_numbers_with_words(const std::string & input_text) {
std::regex number_pattern(R"(\d+(\.\d+)?)");
std::string result;
auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
auto end = std::sregex_iterator();
size_t last_pos = 0;
for (std::sregex_iterator i = it; i != end; ++i) {
const std::smatch& match = *i;
result.append(input_text, last_pos, match.position() - last_pos);
result.append(number_to_words(match.str()));
last_pos = match.position() + match.length();
}
result.append(input_text, last_pos);
return result;
}
// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
static std::string process_text(const std::string & text) {
// For now I skipped text romanization as I am unsure how to handle
// uroman and MeCab implementations in C++
// maybe something like https://github.com/anyascii/anyascii/ could work.
// currently only English would be supported in this function
std::string processed_text = replace_numbers_with_words(text);
std::transform(processed_text.begin(), processed_text.end(),
processed_text.begin(), ::tolower);
std::regex special_chars(R"([-_/,\.\\])");
processed_text = std::regex_replace(processed_text, special_chars, " ");
std::regex non_alpha(R"([^a-z\s])");
processed_text = std::regex_replace(processed_text, non_alpha, "");
std::regex multiple_spaces(R"(\s+)");
processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
/*
Replace spaces with the separator token same as in line 365
for (auto & c : prompt_user) {
if (c == ' ') {
prompt_clean += "<|text_sep|>";
*/
processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
return processed_text;
}
static void prompt_add(llama_tokens & prompt, llama_token token) {
prompt.push_back(token);
}
static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
prompt.insert(prompt.end(), tokens.begin(), tokens.end());
}
static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
auto tmp = common_tokenize(model, txt, add_special, parse_special);
prompt_add(prompt, tmp);
}
static void prompt_init(llama_tokens & prompt, const llama_model * model) {
prompt.clear();
prompt_add(prompt, model, "<|im_start|>\n", true, true);
}
int main(int argc, char ** argv) {
common_params params;
params.prompt = "";
params.n_predict = 4096;
params.n_batch = 8192;
params.n_ctx = 8192;
params.sampling.top_k = 4;
params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, };
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
return 1;
}
const int n_parallel = params.n_parallel;
const int n_predict = params.n_predict;
common_init();
// init LLM
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model_ttc = NULL; // text-to-codes
llama_model * model_cts = NULL; // codes-to-speech
llama_context * ctx_ttc = NULL;
llama_context * ctx_cts = NULL;
common_init_result llama_init_ttc = common_init_from_params(params);
model_ttc = llama_init_ttc.model;
ctx_ttc = llama_init_ttc.context;
// TODO: refactor in a common struct
params.model = params.vocoder.model;
params.model_url = params.vocoder.model_url;
params.hf_repo = params.vocoder.hf_repo;
params.hf_file = params.vocoder.hf_file;
params.embedding = true;
common_init_result llama_init_cts = common_init_from_params(params);
model_cts = llama_init_cts.model;
ctx_cts = llama_init_cts.context;
std::vector<common_sampler *> smpl(n_parallel);
for (int i = 0; i < n_parallel; ++i) {
params.sampling.no_perf = (i != 0);
params.sampling.seed = params.sampling.seed + 1;
smpl[i] = common_sampler_init(model_ttc, params.sampling);
}
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl[0]));
LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str());
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl[0]).c_str());
LOG_INF("%s: loading done\n", __func__);
const auto t_main_start = ggml_time_us();
std::vector<llama_token> codes;
// process prompt and generate voice codes
{
LOG_INF("%s: constructing prompt ..\n", __func__);
std::vector<llama_token> prompt_inp;
prompt_init(prompt_inp, model_ttc);
prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
// convert the input text into the necessary format expected by OuteTTS
{
std::string prompt_clean = process_text(params.prompt);
LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
}
prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
// disabled to save time on tokenizing each time
// TODO: load voices from the json files
#if 0
const std::string voice_data = R"(<|audio_start|>
the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|>
just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|>
two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|>
people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|>
is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|>
pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|>
remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|>
sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|>
i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|>
have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|>
some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|>
critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|>
about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|>
some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|>
of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|>
the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|>
gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|>
aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|>
but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|>
its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|>
still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|>
really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|>
enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|>
and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|>
it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
auto tmp = common_tokenize(model_ttc, voice_data, false, true);
printf("\n\n");
for (int i = 0; i < tmp.size(); ++i) {
printf("%d, ", tmp[i]);
}
printf("\n\n");
#else
prompt_add(prompt_inp, llama_tokens {
151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585,
152460, 153375, 151670, 198, 74455, 155808, 151669, 151799,
151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470,
151970, 153413, 152419, 153334, 153289, 153374, 153199, 152040,
153260, 152721, 152680, 153297, 152419, 153248, 152400, 152691,
153368, 153437, 151670, 198, 1722, 155828, 151669, 152607,
152256, 152991, 152299, 152688, 153163, 153016, 152789, 153198,
152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207,
152461, 153321, 153309, 151750, 152137, 153340, 152573, 152267,
153347, 151789, 152681, 153339, 151992, 152512, 151751, 152179,
153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904,
152311, 151670, 198, 1499, 155791, 151669, 152276, 152454,
153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226,
153043, 152325, 153267, 152622, 151670, 198, 4250, 155797,
151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271,
152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213,
152112, 153204, 151722, 152542, 151670, 198, 19789, 155796,
151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002,
152191, 151734, 152312, 152810, 152237, 153224, 153169, 153224,
152244, 153387, 153404, 151670, 198, 16069, 155811, 151669,
152265, 151946, 151808, 152412, 152363, 152305, 153156, 152733,
152810, 153157, 152016, 152100, 152069, 153234, 152317, 152589,
152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504,
153376, 152272, 152433, 152325, 151941, 151670, 198, 285,
155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381,
152474, 152680, 152157, 153255, 152324, 151682, 151670, 198,
32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682,
152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488,
153070, 151883, 152890, 152489, 153144, 153375, 152358, 151685,
152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669,
151902, 152720, 153377, 152027, 152378, 152821, 153207, 153459,
153028, 153068, 152507, 153255, 152158, 152921, 151958, 152609,
152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470,
152606, 152162, 152186, 153071, 152244, 153118, 153375, 153018,
152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736,
153380, 153502, 152702, 152115, 153181, 152735, 153277, 153457,
152393, 153112, 152595, 151670, 198, 19098, 155808, 151669,
152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239,
153163, 152922, 153402, 152034, 152591, 153438, 152215, 151673,
152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482,
152718, 152862, 153347, 151670, 198, 72, 155780, 151669, 151795,
152111, 152746, 152377, 153471, 152309, 151670, 198, 19016,
155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701,
152939, 152536, 152091, 151815, 152733, 151672, 151670, 198,
14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042,
153504, 152589, 153333, 151839, 151941, 153038, 153180, 151670,
198, 36996, 8303, 155832, 151669, 152231, 152256, 152835,
152801, 152985, 153400, 152393, 152818, 152765, 152249, 152600,
151699, 152302, 152752, 153018, 153009, 151992, 153054, 152847,
153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458,
152048, 152757, 152428, 153195, 151906, 153006, 153178, 153250,
152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418,
152228, 152733, 151670, 198, 9096, 155801, 151669, 151698,
153321, 152217, 153039, 152935, 153400, 152122, 152531, 153106,
152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851,
152901, 152885, 152594, 153446, 153080, 151670, 198, 14689,
155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191,
151673, 151690, 151698, 152714, 152846, 152981, 153171, 153384,
153364, 153188, 153246, 151670, 198, 1055, 155779, 151669,
151869, 152388, 152711, 153334, 151736, 151670, 198, 1782,
155780, 151669, 153483, 153240, 152241, 152558, 152697, 153046,
151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605,
153034, 153434, 153372, 153347, 151887, 152453, 152758, 152133,
152510, 152694, 152431, 152321, 153088, 152676, 152223, 152581,
152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032,
152903, 152859, 152989, 151748, 152669, 152661, 152650, 152409,
151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469,
152988, 152894, 151819, 152391, 153019, 152058, 153062, 153230,
151826, 152112, 152306, 152264, 152769, 153390, 152384, 152435,
152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540,
151919, 151893, 152558, 152817, 152946, 152956, 152129, 152715,
153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450,
151670, 198, 8088, 155792, 151669, 152452, 153497, 153353,
152679, 152533, 152382, 152374, 152611, 153341, 153163, 152285,
153411, 152495, 153141, 152320, 151670, 198, 1199, 155781,
151669, 151764, 152360, 153295, 152634, 153342, 152199, 152271,
151670, 198, 43366, 155799, 151669, 152308, 151682, 152889,
152016, 152385, 152629, 152495, 151826, 153321, 152958, 152180,
151886, 153432, 152922, 152128, 153024, 153040, 152593, 152287,
151677, 151670, 198, 53660, 155808, 151669, 151727, 152092,
152680, 153331, 151699, 152316, 152938, 152289, 152433, 153384,
151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691,
152489, 151941, 152049, 152034, 153053, 152179, 153160, 151676,
153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350,
152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234,
153135, 152291, 153235, 152143, 152583, 152402, 153483, 152678,
152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825,
152548, 153442, 152109, 152659, 153325, 152781, 152570, 152957,
151752, 152265, 153381, 152515, 151670, 198, 437, 155787,
151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174,
151792, 153409, 153327, 152990, 151670, 198, 275, 155781,
151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974,
151670, 198, 94273, 155799, 151669, 152953, 152938, 153427,
152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331,
152257, 152987, 152777, 153448, 152408, 151696, 152408, 152326,
152699, 151670, 198, 385, 16239, 155828, 151669, 152306, 152268,
153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110,
152918, 152923, 152467, 152331, 153053, 153330, 151889, 153444,
152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751,
152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499,
152109, 152255, 151739, 152267, 152759, 153318, 153165, 153349,
151670,});
#endif
// print the prompt token-by-token
LOG("\n");
for (auto id : prompt_inp) {
LOG("%s", common_token_to_piece(ctx_ttc, id).c_str());
}
LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size());
LOG("\n");
// create a llama_batch
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(prompt_inp.size(), (size_t) n_parallel), 0, n_parallel);
std::vector<llama_seq_id> seq_ids(n_parallel, 0);
for (int32_t i = 0; i < n_parallel; ++i) {
seq_ids[i] = i;
}
// evaluate the initial prompt
for (size_t i = 0; i < prompt_inp.size(); ++i) {
common_batch_add(batch, prompt_inp[i], i, seq_ids, false);
}
GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());
// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx_ttc, batch) != 0) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
if (n_parallel > 1) {
LOG_INF("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
}
llama_synchronize(ctx_ttc);
LOG_INF("%s: time for prompt: %.3f ms\n\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
const auto t_dec_start = ggml_time_us();
// main loop
// remember the batch index of the last token for each parallel sequence
// we need this to determine which logits to sample from
std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
int n_past = batch.n_tokens;
int n_decode = 0;
while (n_decode <= n_predict) {
// prepare the next batch
common_batch_clear(batch);
// sample the next token for each parallel sequence / stream
for (int32_t i = 0; i < n_parallel; ++i) {
if (i_batch[i] < 0) {
// the stream has already finished
continue;
}
const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
common_sampler_accept(smpl[i], new_token_id, true);
codes.push_back(new_token_id);
const auto * cands = common_sampler_get_candidates(smpl[i]);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model_ttc, new_token_id) || n_decode == n_predict) {
std::string reason;
if (llama_token_is_eog(model_ttc, new_token_id)) {
reason = "eos";
} else {
reason = "n_predict";
}
i_batch[i] = -1;
LOG("\n");
if (n_parallel > 1) {
LOG_CNT("\n");
LOG_INF("%s: stream %d finished at n_past = %d, reason = '%s'\n", __func__, i, n_past, reason.c_str());
}
continue;
}
{
const float p = cands->data[cands->selected].p;
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) ((3*p)*float(k_colors.size()))));
LOG_CNT("%s%d%s", k_colors[col].c_str(), i, "\033[0m");
//LOG_CNT("%d", i);
}
i_batch[i] = batch.n_tokens;
// push this new token for next evaluation
common_batch_add(batch, new_token_id, n_past, { i }, true);
}
// all streams are finished
if (batch.n_tokens == 0) {
break;
}
n_decode += 1;
n_past += 1;
// evaluate the current batch with the transformer model
if (llama_decode(ctx_ttc, batch)) {
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
llama_batch_free(batch);
LOG("\n");
LOG_INF("%s: time for decoder: %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f);
}
common_perf_print(ctx_ttc, smpl[0]);
//std::vector<llama_token> codes = {198, 88225, 155856, 151669, 152205,
// 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695,
// 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010,
// 153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286,
// 152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296,
// 153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690,
// 153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061,
// 153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670,
// 198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683,
// 152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908,
// 151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359,
// 153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424,
// 151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670,
// 198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729,
// 152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669,
// 153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670,
// 198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501,
// 152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242,
// 153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360,
// 153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055,
// 152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670,
// 198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441,
// 152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831,
// 153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133,
// 153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109,
// 152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055,
// 155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729,
// 151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337,
// 153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153,
// 153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365,
// 153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218,
// 152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464,
// 152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855,
// 152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418,
// 153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645};
{
const std::string inp_txt = common_detokenize(ctx_ttc, codes, true);
LOG("\n");
LOG_INF("codes: '%s'\n", inp_txt.c_str());
LOG_INF("%s: codes size: %d\n", __func__, (int) codes.size());
}
// remove all non-audio tokens (i.e. < 151672 || > 155772)
codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
{
const std::string inp_txt = common_detokenize(ctx_ttc, codes, true);
LOG_INF("codes audio: '%s'\n", inp_txt.c_str());
LOG_INF("%s: codes audio size: %d\n", __func__, (int) codes.size());
}
for (auto & token : codes) {
token -= 151672;
}
const auto t_voc_start = ggml_time_us();
const int n_codes = codes.size();
llama_batch batch = llama_batch_init(n_codes, 0, 1);
for (size_t i = 0; i < codes.size(); ++i) {
common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits?
}
GGML_ASSERT(batch.n_tokens == n_codes);
if (llama_decode(ctx_cts, batch) != 0) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
llama_synchronize(ctx_cts);
LOG_INF("%s: time for vocoder: %.3f ms\n", __func__, (ggml_time_us() - t_voc_start) / 1000.0f);
const auto t_spec_start = ggml_time_us();
#if 1
// spectral operations
const int n_embd = llama_n_embd(model_cts);
const float * embd = llama_get_embeddings(ctx_cts);
auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);
#else
// read the spectrogram from a file for debugging purposes
std::vector<float> audio;
{
std::ifstream fin("out.bin", std::ios::binary);
if (!fin) {
LOG_ERR("%s: failed to open file '%s'\n", __func__, "out.bin");
return 1;
}
std::vector<float> embd;
int n_codes;
int n_embd;
fin.read(reinterpret_cast<char *>(&n_codes), sizeof(int));
fin.read(reinterpret_cast<char *>(&n_embd), sizeof(int));
embd.resize(n_codes * n_embd);
fin.read(reinterpret_cast<char *>(embd.data()), n_codes * n_embd * sizeof(float));
fin.close();
LOG_INF("%s: n_codes: %d, n_embd: %d\n", __func__, n_codes, n_embd);
audio = embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads);
}
#endif
const std::string fname = "output.wav";
const int n_sr = 24000; // sampling rate
// zero out first 0.25 seconds
for (int i = 0; i < 24000/4; ++i) {
audio[i] = 0.0f;
}
LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
save_wav16(fname, audio, n_sr);
LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
llama_free(ctx_ttc);
llama_free_model(model_ttc);
llama_free(ctx_cts);
llama_free_model(model_cts);
llama_backend_free();
return 0;
}

View File

@ -32,6 +32,13 @@ else()
endif() endif()
endif() endif()
# remove the lib prefix on win32 mingw
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "")
set(CMAKE_SHARED_LIBRARY_PREFIX "")
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF) option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
@ -68,7 +75,7 @@ endif()
# general # general
option(GGML_STATIC "ggml: static link libraries" OFF) option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO "ggml: enable link time optimization" OFF) option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON) option(GGML_CCACHE "ggml: use ccache if available" ON)
@ -94,28 +101,32 @@ endif()
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512" OFF) option(GGML_AVX512 "ggml: enable AVX512F" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC) if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512 # in MSVC F16C and FMA is implied with AVX2/AVX512
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
# MSVC does not seem to support AMX
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
endif() endif()
option(GGML_LASX "ggml: enable lasx" ON) option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON) option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
if (WIN32) if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version") set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
endif() endif()
# ggml core # ggml core
@ -169,6 +180,11 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture") "ggml: sycl device architecture")
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
# extra artifacts # extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
@ -180,11 +196,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD_REQUIRED true)
if (GGML_SYCL) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_CXX_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON) set(THREADS_PREFER_PTHREAD_FLAG ON)

View File

@ -228,6 +228,7 @@ extern "C" {
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries // Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void); GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
// //
// Backend scheduler // Backend scheduler

View File

@ -103,24 +103,14 @@ extern "C" {
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc); const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits_cpu { struct ggml_type_traits_cpu {
ggml_from_float_t from_float; ggml_from_float_t from_float;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot; ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type; enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
}; };
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
@ -140,13 +130,6 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -0,0 +1,26 @@
#ifndef GGML_OPENCL_H
#define GGML_OPENCL_H
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
#ifdef __cplusplus
}
#endif
#endif // GGML_OPENCL_H

View File

@ -238,6 +238,8 @@
#define GGML_EXIT_ABORTED 1 #define GGML_EXIT_ABORTED 1
#define GGML_ROPE_TYPE_NEOX 2 #define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24
#define GGUF_MAGIC "GGUF" #define GGUF_MAGIC "GGUF"
@ -384,15 +386,15 @@ extern "C" {
GGML_TYPE_F64 = 28, GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29, GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30, GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31, // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
GGML_TYPE_Q4_0_4_8 = 32, // GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33, // GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35, GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36, // GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37, // GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38, // GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT, GGML_TYPE_COUNT = 39,
}; };
// precision // precision
@ -433,9 +435,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
}; };
// available tensor operations: // available tensor operations:
@ -499,6 +498,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK, GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@ -1445,6 +1445,22 @@ extern "C" {
float beta_fast, float beta_fast,
float beta_slow); float beta_slow);
GGML_API struct ggml_tensor * ggml_rope_multi(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
// in-place, returns view(a) // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_ext_inplace( GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1548,17 +1564,6 @@ extern "C" {
int d1, // dilation dimension 1 int d1, // dilation dimension 1
bool is_2D); bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_1d( GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel struct ggml_tensor * a, // convolution kernel
@ -1576,6 +1581,23 @@ extern "C" {
int s, // stride int s, // stride
int d); // dilation int d); // dilation
// depthwise
// TODO: this is very likely wrong for some cases! - needs more testing
GGML_API struct ggml_tensor * ggml_conv_1d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int p0, // padding
int d0); // dilation
GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int d0); // dilation
GGML_API struct ggml_tensor * ggml_conv_transpose_1d( GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel struct ggml_tensor * a, // convolution kernel
@ -1595,7 +1617,6 @@ extern "C" {
int d0, // dilation dimension 0 int d0, // dilation dimension 0
int d1); // dilation dimension 1 int d1); // dilation dimension 1
// kernel size is a->ne[0] x a->ne[1] // kernel size is a->ne[0] x a->ne[1]
// stride is equal to kernel size // stride is equal to kernel size
// padding is zero // padding is zero
@ -1622,6 +1643,18 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// depthwise
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1695,6 +1728,13 @@ extern "C" {
int p2, int p2,
int p3); int p3);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
@ -2198,10 +2238,18 @@ extern "C" {
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
#ifdef __cplusplus #ifdef __cplusplus
// restrict not standard in C++ // restrict not standard in C++
#define GGML_RESTRICT # if defined(__GNUC__)
# define GGML_RESTRICT __restrict__
# elif defined(__clang__)
# define GGML_RESTRICT __restrict
# elif defined(_MSC_VER)
# define GGML_RESTRICT __restrict
# else
# define GGML_RESTRICT
# endif
#else #else
#define GGML_RESTRICT restrict # define GGML_RESTRICT restrict
#endif #endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

View File

@ -194,11 +194,6 @@ endif()
if (WIN32) if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
if (BUILD_SHARED_LIBS)
# TODO: should not use this
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
endif() endif()
# ggml # ggml
@ -220,9 +215,7 @@ add_library(ggml-base
ggml-threading.cpp ggml-threading.cpp
ggml-threading.h ggml-threading.h
ggml-quants.c ggml-quants.c
ggml-quants.h ggml-quants.h)
ggml-aarch64.c
ggml-aarch64.h)
target_include_directories(ggml-base PRIVATE .) target_include_directories(ggml-base PRIVATE .)
@ -269,7 +262,42 @@ function(ggml_add_backend backend)
endif() endif()
endfunction() endfunction()
function(ggml_add_cpu_backend_variant tag_name)
set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
foreach (feat NATIVE
AVX AVX2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
set(GGML_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_${feat} ON)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
endfunction()
ggml_add_backend(CPU) ggml_add_backend(CPU)
if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
if (NOT MSVC)
# MSVC doesn't support AVX-VNNI or AMX
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
endif()
else ()
ggml_add_cpu_backend_variant_impl("")
endif()
ggml_add_backend(BLAS) ggml_add_backend(BLAS)
ggml_add_backend(CANN) ggml_add_backend(CANN)
ggml_add_backend(CUDA) ggml_add_backend(CUDA)
@ -280,6 +308,7 @@ ggml_add_backend(MUSA)
ggml_add_backend(RPC) ggml_add_backend(RPC)
ggml_add_backend(SYCL) ggml_add_backend(SYCL)
ggml_add_backend(Vulkan) ggml_add_backend(Vulkan)
ggml_add_backend(OpenCL)
foreach (target ggml-base ggml) foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

View File

@ -1,129 +0,0 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml-aarch64.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include <assert.h>
#define UNUSED GGML_UNUSED
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;
for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}
const int end = QK4_0 * 2 / blck_size_interleave;
if (blck_size_interleave == 8) {
const uint64_t xor_mask = 0x8888888888888888ULL;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
// Using memcpy to avoid unaligned memory accesses
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
} else if (blck_size_interleave == 4) {
const uint32_t xor_mask = 0x88888888;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint32_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
}
} else {
GGML_ASSERT(false);
}
return out;
}
// interleave 8 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x8 out;
for (int i = 0; i < 8; i++) {
out.d[i] = in[i].d;
}
const int end = QK4_0 * 4 / blck_size_interleave;
const uint64_t xor_mask = 0x8888888888888888ULL;
for (int i = 0; i < end; ++i) {
int src_id = i % 8;
int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
return out;
}
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
assert(n_per_row % QK4_0 == 0);
const int nb = n_per_row / QK4_0;
void * out_ptr = NULL;
if (nrows_interleaved == 8) {
out_ptr = (block_q4_0x8 *) dst;
}
else if (nrows_interleaved == 4) {
out_ptr = (block_q4_0x4 *) dst;
}
assert(nrows_interleaved <= 8);
block_q4_0 dst_tmp[8];
for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
for (int64_t x = 0; x < nb; x++) {
for (int i = 0; i < nrows_interleaved; i++ ) {
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
}
if (nrows_interleaved == 8) {
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x8 *) out_ptr + 1;
}
else if (nrows_interleaved == 4) {
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x4 *) out_ptr + 1;
}
}
}
return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
}
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
}
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
}
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}

View File

@ -1,19 +0,0 @@
#pragma once
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
#ifdef __cplusplus
}
#endif

View File

@ -534,7 +534,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node); size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
hn->buffer_id = buffer_id; hn->buffer_id = buffer_id;
hn->offset = offset; hn->offset = offset;
return;
} }
} }

View File

@ -46,6 +46,10 @@
#include "ggml-vulkan.h" #include "ggml-vulkan.h"
#endif #endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
#ifdef GGML_USE_BLAS #ifdef GGML_USE_BLAS
#include "ggml-blas.h" #include "ggml-blas.h"
#endif #endif
@ -146,6 +150,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg()); register_backend(ggml_backend_vk_reg());
#endif #endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg()); register_backend(ggml_backend_cann_reg());
#endif #endif
@ -449,11 +456,21 @@ static std::string backend_filename_suffix() {
#endif #endif
} }
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths // TODO: search system paths
std::vector<std::string> search_paths = { "./", get_executable_path() };
std::string file_prefix = backend_filename_prefix() + name + "-"; std::string file_prefix = backend_filename_prefix() + name + "-";
std::vector<std::string> search_paths;
if (user_search_path == nullptr) {
search_paths.push_back("./");
search_paths.push_back(get_executable_path());
} else {
#if defined(_WIN32)
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
}
int best_score = 0; int best_score = 0;
std::string best_path; std::string best_path;
@ -463,7 +480,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
if (!fs::exists(search_path)) { if (!fs::exists(search_path)) {
continue; continue;
} }
for (const auto & entry : fs::directory_iterator(search_path)) { fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) { if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string(); std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string(); std::string ext = entry.path().extension().string();
@ -483,6 +501,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
best_score = s; best_score = s;
best_path = entry.path().string(); best_path = entry.path().string();
} }
} else {
if (!silent) {
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
}
} }
} }
} }
@ -505,15 +527,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
} }
void ggml_backend_load_all() { void ggml_backend_load_all() {
ggml_backend_load_best("blas", true); ggml_backend_load_all_from_path(nullptr);
ggml_backend_load_best("cann", true); }
ggml_backend_load_best("cuda", true);
ggml_backend_load_best("hip", true); void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("kompute", true); #ifdef NDEBUG
ggml_backend_load_best("metal", true); bool silent = true;
ggml_backend_load_best("rpc", true); #else
ggml_backend_load_best("sycl", true); bool silent = false;
ggml_backend_load_best("vulkan", true); #endif
ggml_backend_load_best("musa", true);
ggml_backend_load_best("cpu", true); ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
} }

View File

@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (*ext_factor != 0) { if (*ext_factor != 0) {
return false; return false;
} }
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return true; return true;
} }
case GGML_OP_UPSCALE: { case GGML_OP_UPSCALE: {
@ -2089,7 +2098,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con
static const ggml_backend_reg_i ggml_backend_cann_reg_interface = { static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
/* .get_name = */ ggml_backend_cann_reg_get_name, /* .get_name = */ ggml_backend_cann_reg_get_name,
/* .get_device_count = */ ggml_backend_cann_reg_get_device_count, /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
/* .get_device_get = */ ggml_backend_cann_reg_get_device, /* .get_device = */ ggml_backend_cann_reg_get_device,
/* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address, /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
}; };

View File

@ -6,7 +6,20 @@
typedef uint16_t ggml_half; typedef uint16_t ggml_half;
typedef uint32_t ggml_half2; typedef uint32_t ggml_half2;
#define GGML_COMMON_AGGR #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
// std-c++ allow anonymous unions but some compiler warn on it
#define GGML_COMMON_AGGR_U data
// std-c++ do not allow it.
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL) #elif defined(GGML_COMMON_DECL_METAL)
@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
typedef half ggml_half; typedef half ggml_half;
typedef half2 ggml_half2; typedef half2 ggml_half2;
#define GGML_COMMON_AGGR #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA) #elif defined(GGML_COMMON_DECL_CUDA)
@ -29,7 +43,8 @@ typedef half2 ggml_half2;
typedef half ggml_half; typedef half ggml_half;
typedef half2 ggml_half2; typedef half2 ggml_half2;
#define GGML_COMMON_AGGR data #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP) #elif defined(GGML_COMMON_DECL_HIP)
@ -39,7 +54,8 @@ typedef half2 ggml_half2;
typedef half ggml_half; typedef half ggml_half;
typedef half2 ggml_half2; typedef half2 ggml_half2;
#define GGML_COMMON_AGGR data #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL) #elif defined(GGML_COMMON_DECL_SYCL)
@ -49,7 +65,8 @@ typedef half2 ggml_half2;
typedef sycl::half ggml_half; typedef sycl::half ggml_half;
typedef sycl::half2 ggml_half2; typedef sycl::half2 ggml_half2;
#define GGML_COMMON_AGGR data #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#endif #endif
@ -154,9 +171,9 @@ typedef struct {
struct { struct {
ggml_half d; // delta ggml_half d; // delta
ggml_half m; // min ggml_half m; // min
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
uint8_t qs[QK4_1 / 2]; // nibbles / quants uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1; } block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
@ -175,9 +192,9 @@ typedef struct {
struct { struct {
ggml_half d; // delta ggml_half d; // delta
ggml_half m; // min ggml_half m; // min
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
uint8_t qh[4]; // 5-th bit of quants uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1; } block_q5_1;
@ -196,37 +213,13 @@ typedef struct {
struct { struct {
ggml_half d; // delta ggml_half d; // delta
ggml_half s; // d * sum(qs[i]) ggml_half s; // d * sum(qs[i])
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 ds; ggml_half2 ds;
}; } GGML_COMMON_AGGR_U;
int8_t qs[QK8_1]; // quants int8_t qs[QK8_1]; // quants
} block_q8_1; } block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q4_0 blocks
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
} block_q4_0x4;
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q4_0 blocks
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
} block_q4_0x8;
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
} block_q8_0x4;
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q8_0 blocks
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
} block_q8_0x8;
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
// //
// Ternary quantization // Ternary quantization
// //
@ -261,9 +254,9 @@ typedef struct {
struct { struct {
ggml_half d; // super-block scale for quantized scales ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
} block_q2_K; } block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
@ -288,9 +281,9 @@ typedef struct {
struct { struct {
ggml_half d; // super-block scale for quantized scales ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K; } block_q4_K;
@ -305,9 +298,9 @@ typedef struct {
struct { struct {
ggml_half d; // super-block scale for quantized scales ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits uint8_t qs[QK_K/2]; // quants, low 4 bits
@ -418,12 +411,6 @@ typedef struct {
} block_iq4_xs; } block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 iq4_nl blocks
uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
#endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL
@ -437,6 +424,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() }; #define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL #define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL) #elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib> #include <metal_stdlib>
@ -479,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
GGML_TABLE_END() GGML_TABLE_END()
//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics //#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128) GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,

View File

@ -1,190 +1,163 @@
ggml_add_backend_library(ggml-cpu) function(ggml_add_cpu_backend_variant_impl tag_name)
if (tag_name)
set(GGML_CPU_NAME ggml-cpu-${tag_name})
else()
set(GGML_CPU_NAME ggml-cpu)
endif()
list (APPEND GGML_CPU_SOURCES ggml_add_backend_library(${GGML_CPU_NAME})
ggml-cpu.c
ggml-cpu.cpp list (APPEND GGML_CPU_SOURCES
ggml-cpu-aarch64.c ggml-cpu/ggml-cpu.c
ggml-cpu-aarch64.h ggml-cpu/ggml-cpu.cpp
ggml-cpu-quants.c ggml-cpu/ggml-cpu-aarch64.cpp
ggml-cpu-quants.h ggml-cpu/ggml-cpu-aarch64.h
amx/amx.cpp ggml-cpu/ggml-cpu-hbm.cpp
amx/amx.h ggml-cpu/ggml-cpu-hbm.h
amx/mmq.cpp ggml-cpu/ggml-cpu-quants.c
amx/mmq.h ggml-cpu/ggml-cpu-quants.h
ggml-cpu-impl.h ggml-cpu/ggml-cpu-traits.cpp
ggml-cpu/ggml-cpu-traits.h
ggml-cpu/amx/amx.cpp
ggml-cpu/amx/amx.h
ggml-cpu/amx/mmq.cpp
ggml-cpu/amx/mmq.h
ggml-cpu/ggml-cpu-impl.h
) )
target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17) target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
target_include_directories(ggml-cpu PRIVATE .) target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
if (APPLE AND GGML_ACCELERATE) if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate) find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK) if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found") message(STATUS "Accelerate framework found")
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE) target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE)
target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK) target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK)
target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64) target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64)
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
else() else()
message(WARNING "Accelerate framework not found") message(WARNING "Accelerate framework not found")
endif() endif()
endif() endif()
if (GGML_OPENMP) if (GGML_OPENMP)
find_package(OpenMP) find_package(OpenMP)
if (OpenMP_FOUND) if (OpenMP_FOUND)
message(STATUS "OpenMP found") target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP) target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
else() else()
message(WARNING "OpenMP not found") message(WARNING "OpenMP not found")
endif() endif()
endif() endif()
if (GGML_LLAMAFILE) if (GGML_LLAMAFILE)
message(STATUS "Using llamafile") target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
list(APPEND GGML_CPU_SOURCES list(APPEND GGML_CPU_SOURCES
llamafile/sgemm.cpp ggml-cpu/llamafile/sgemm.cpp
llamafile/sgemm.h) ggml-cpu/llamafile/sgemm.h)
endif() endif()
if (GGML_CPU_HBM) if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED) find_library(memkind memkind REQUIRED)
message(STATUS "Using memkind for CPU HBM") message(STATUS "Using memkind for CPU HBM")
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM) target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
target_link_libraries(ggml-cpu PUBLIC memkind) target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
endif() endif()
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
(NOT CMAKE_OSX_ARCHITECTURES AND (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
message(STATUS "ARM detected") message(STATUS "ARM detected")
if (MSVC) if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
list(APPEND ARCH_DEFINITIONS __ARM_NEON)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled")
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
elseif (APPLE)
if (GGML_NATIVE)
set(USER_PROVIDED_MARCH FALSE)
foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
set(USER_PROVIDED_MARCH TRUE)
break()
endif()
endforeach()
if (NOT USER_PROVIDED_MARCH)
set(MARCH_FLAGS "-march=armv8.2a")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled")
endif ()
set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
endif ()
endif ()
else() else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee) list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) list(APPEND ARCH_FLAGS -mcpu=native)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
# -mcpu=native does not always enable all the features in some compilers,
# so we check for them manually and enable them if available
include(CheckCXXSourceRuns)
set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}+dotprod")
check_cxx_source_runs(
"#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }"
GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
set(ARCH_FLAGS "${ARCH_FLAGS}+dotprod")
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}+i8mm")
# Android armeabi-v7a check_cxx_source_runs(
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }"
GGML_COMPILER_SUPPORT_I8MM)
if (GGML_COMPILER_SUPPORT_I8MM)
set(ARCH_FLAGS "${ARCH_FLAGS}+i8mm")
endif()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
else() else()
# Raspberry Pi 2 if (GGML_CPU_ARM_ARCH)
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
endif() endif()
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a # show enabled features
# Raspberry Pi 3, 4, Zero 2 (32-bit) execute_process(
list(APPEND ARCH_FLAGS -mno-unaligned-access) COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
INPUT_FILE "/dev/null"
OUTPUT_VARIABLE ARM_FEATURE
RESULT_VARIABLE ARM_FEATURE_RESULT
)
if (ARM_FEATURE_RESULT)
message(FATAL_ERROR "Failed to get ARM features")
else()
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
if (NOT ${feature_pos} EQUAL -1)
message(STATUS "ARM feature ${feature} enabled")
endif() endif()
if (GGML_SVE) endforeach()
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif() endif()
endif() endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
message(STATUS "x86 detected") message(STATUS "x86 detected")
if (MSVC) if (MSVC)
# instruction set detection for MSVC only # instruction set detection for MSVC only
if (GGML_NATIVE) if (GGML_NATIVE)
include(cmake/FindSIMD.cmake) include(ggml-cpu/cmake/FindSIMD.cmake)
endif () endif ()
if (GGML_AVX512) if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512) list(APPEND ARCH_FLAGS /arch:AVX512)
# /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
# MSVC has no compile-time flags enabling specific # MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the # AVX512 extensions, neither it defines the
# macros corresponding to the extensions. # macros corresponding to the extensions.
# Do it manually. # Do it manually.
list(APPEND ARCH_DEFINITIONS GGML_AVX512)
if (GGML_AVX512_VBMI) if (GGML_AVX512_VBMI)
list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
@ -192,81 +165,101 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
endif() endif()
endif() endif()
if (GGML_AVX512_VNNI) if (GGML_AVX512_VNNI)
list(APPEND ARCH_DEFINITIONS __AVX512VNNI__) list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vnni) list(APPEND ARCH_FLAGS -mavx512vnni)
endif() endif()
endif() endif()
if (GGML_AVX512_BF16) if (GGML_AVX512_BF16)
list(APPEND ARCH_DEFINITIONS __AVX512BF16__) list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512bf16) list(APPEND ARCH_FLAGS -mavx512bf16)
endif() endif()
endif() endif()
if (GGML_AMX_TILE) if (GGML_AMX_TILE)
list(APPEND ARCH_DEFINITIONS __AMX_TILE__) list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE)
endif() endif()
if (GGML_AMX_INT8) if (GGML_AMX_INT8)
list(APPEND ARCH_DEFINITIONS __AMX_INT8__) list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8)
endif() endif()
if (GGML_AMX_BF16) if (GGML_AMX_BF16)
list(APPEND ARCH_DEFINITIONS __AMX_BF16__) list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16)
endif() endif()
elseif (GGML_AVX2) elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2) list(APPEND ARCH_FLAGS /arch:AVX2)
list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
elseif (GGML_AVX) elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX) list(APPEND ARCH_FLAGS /arch:AVX)
list(APPEND ARCH_DEFINITIONS GGML_AVX)
else ()
list(APPEND ARCH_FLAGS /arch:SSE4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
endif() endif()
if (GGML_AVX_VNNI) if (GGML_AVX_VNNI)
list(APPEND ARCH_DEFINITIONS __AVXVNNI__) # MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") #list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
list(APPEND ARCH_FLAGS -mavxvnni)
endif() endif()
endif() else ()
else()
if (GGML_NATIVE) if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native) list(APPEND ARCH_FLAGS -march=native)
endif() else ()
list(APPEND ARCH_FLAGS -msse4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
if (GGML_F16C) if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c) list(APPEND ARCH_FLAGS -mf16c)
list(APPEND ARCH_DEFINITIONS GGML_F16C)
endif() endif()
if (GGML_FMA) if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma) list(APPEND ARCH_FLAGS -mfma)
list(APPEND ARCH_DEFINITIONS GGML_FMA)
endif() endif()
if (GGML_AVX) if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx) list(APPEND ARCH_FLAGS -mavx)
list(APPEND ARCH_DEFINITIONS GGML_AVX)
endif() endif()
if (GGML_AVX2) if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2) list(APPEND ARCH_FLAGS -mavx2)
list(APPEND ARCH_DEFINITIONS GGML_AVX2)
endif() endif()
if (GGML_AVX_VNNI) if (GGML_AVX_VNNI)
list(APPEND ARCH_FLAGS -mavxvnni) list(APPEND ARCH_FLAGS -mavxvnni)
list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI)
endif() endif()
if (GGML_AVX512) if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f) list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512cd)
list(APPEND ARCH_FLAGS -mavx512vl)
list(APPEND ARCH_FLAGS -mavx512dq) list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw) list(APPEND ARCH_FLAGS -mavx512bw)
list(APPEND ARCH_DEFINITIONS GGML_AVX512)
endif() endif()
if (GGML_AVX512_VBMI) if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi) list(APPEND ARCH_FLAGS -mavx512vbmi)
list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI)
endif() endif()
if (GGML_AVX512_VNNI) if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni) list(APPEND ARCH_FLAGS -mavx512vnni)
list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI)
endif() endif()
if (GGML_AVX512_BF16) if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16) list(APPEND ARCH_FLAGS -mavx512bf16)
list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16)
endif() endif()
if (GGML_AMX_TILE) if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile) list(APPEND ARCH_FLAGS -mamx-tile)
list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE)
endif() endif()
if (GGML_AMX_INT8) if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8) list(APPEND ARCH_FLAGS -mamx-int8)
list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8)
endif() endif()
if (GGML_AMX_BF16) if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16) list(APPEND ARCH_FLAGS -mamx-bf16)
list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16)
endif() endif()
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected") message(STATUS "PowerPC detected")
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
string(FIND "${POWER10_M}" "POWER10" substring_index) string(FIND "${POWER10_M}" "POWER10" substring_index)
@ -282,7 +275,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
# TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected") message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64) list(APPEND ARCH_FLAGS -march=loongarch64)
@ -292,28 +285,39 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
if (GGML_LSX) if (GGML_LSX)
list(APPEND ARCH_FLAGS -mlsx) list(APPEND ARCH_FLAGS -mlsx)
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
message(STATUS "RISC-V detected") message(STATUS "RISC-V detected")
if (GGML_RVV) if (GGML_RVV)
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif() endif()
else() else()
message(STATUS "Unknown architecture") message(STATUS "Unknown architecture")
endif() endif()
if (GGML_CPU_AARCH64) if (GGML_CPU_AARCH64)
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64) endif()
endif()
target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
# the feature detection code must be compiled without any architecture flags if (GGML_BACKEND_DL)
target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp) # The feature detection code is compiled as a separate target so that
# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection # it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
endif()
if (EMSCRIPTEN) if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
endif() endif()
endfunction()

View File

@ -5,6 +5,7 @@
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-cpu.h" #include "ggml-cpu.h"
#include "ggml-cpu-traits.h"
#if defined(__gnu_linux__) #if defined(__gnu_linux__)
#include <sys/syscall.h> #include <sys/syscall.h>
@ -17,31 +18,65 @@
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// AMX type_trais
namespace ggml::cpu::amx {
class tensor_traits : public ggml::cpu::tensor_traits {
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
size = ggml_backend_amx_desired_wsize(op);
return true;
}
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT) {
ggml_backend_amx_mul_mat(params, op);
return true;
}
return false;
}
};
static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
static tensor_traits traits;
return &traits;
}
} // namespace ggml::cpu::amx
// AMX buffer interface // AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context); free(buffer->context);
} }
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context); return (void *) (buffer->context);
} }
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
memset((char *)tensor->data + offset, value, size); tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
uint8_t value, size_t offset, size_t size) {
memset((char *) tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) { if (qtype_has_amx_kernels(tensor->type)) {
GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
ggml_backend_amx_convert_weight(tensor, data, offset, size); ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else { } else {
memcpy((char *)tensor->data + offset, data, size); memcpy((char *) tensor->data + offset, data, size);
} }
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
/*
// need to figure what we need to do with buffer->extra.
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size); memcpy(data, (const char *)tensor->data + offset, size);
@ -62,6 +97,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
*/
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size); memset(buffer->context, value, buffer->size);
@ -70,13 +106,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base, /* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ ggml_backend_amx_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, /* .get_tensor = */ nullptr,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, /* .cpy_tensor = */ nullptr,
/* .clear = */ ggml_backend_amx_buffer_clear, /* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL, /* .reset = */ nullptr,
}; };
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
@ -86,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
} }
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size); void * data = ggml_aligned_malloc(size);
if (data == NULL) { if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL; return NULL;
@ -101,14 +137,44 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { namespace ggml::cpu::amx {
return ggml_backend_amx_get_alloc_size(tensor); class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
GGML_UNUSED(buft); if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
} is_contiguous_2d(op->src[1]) && // src1 must be contiguous
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
// src1 must be host buffer
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false; return false;
}
// src1 must be float32
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
}
return false;
}
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
}
return nullptr;
}
};
} // namespace ggml::cpu::amx
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
return ggml_backend_amx_get_alloc_size(tensor);
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
@ -129,68 +195,26 @@ static bool ggml_amx_init() {
return true; return true;
#endif #endif
} }
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ { /* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name, /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host, /* .is_host = */ nullptr,
}, },
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL, /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
}; };
if (!ggml_amx_init()) { if (!ggml_amx_init()) {
return NULL; return nullptr;
} }
return &ggml_backend_buffer_type_amx; return &ggml_backend_buffer_type_amx;
} }
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
}
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is 32x
return can_use_amx;
}
default:
return false;
}
}
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__) #endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)

View File

@ -1,20 +1,8 @@
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-cpu-impl.h" #include "ggml-cpu-impl.h"
#ifdef __cplusplus // GGML internal header
extern "C" {
#endif
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
#endif
#ifdef __cplusplus
}
#endif #endif

Some files were not shown because too many files have changed in this diff Show More