Merge branch 'master' of https://github.com/piDack/llama.cpp into support_glm_edge_model

This commit is contained in:
piDack 2024-11-29 08:11:31 +00:00
commit 816d93db75
122 changed files with 3729 additions and 3505 deletions

View File

@ -6,6 +6,9 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V
FROM ${BASE_MUSA_DEV_CONTAINER} AS build FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
@ -19,7 +22,11 @@ WORKDIR /app
COPY . . COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ # Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \ cmake --build build --config Release -j$(nproc) && \
cp build/bin/* . cp build/bin/* .

View File

@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
FROM ${BASE_MUSA_DEV_CONTAINER} AS build FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential git cmake apt-get install -y build-essential git cmake
@ -15,7 +18,11 @@ WORKDIR /app
COPY . . COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ # Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) && \ cmake --build build --config Release --target llama-cli -j$(nproc) && \
mkdir -p /app/lib && \ mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \; find build -name "*.so" -exec cp {} /app/lib \;

View File

@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
FROM ${BASE_MUSA_DEV_CONTAINER} AS build FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev apt-get install -y build-essential git cmake libcurl4-openssl-dev
@ -15,7 +18,11 @@ WORKDIR /app
COPY . . COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ # Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc) && \ cmake --build build --config Release --target llama-server -j$(nproc) && \
mkdir -p /app/lib && \ mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \; find build -name "*.so" -exec cp {} /app/lib \;

View File

@ -34,7 +34,7 @@ let
# server tests # server tests
openai openai
behave pytest
prometheus-client prometheus-client
]; ];
in in

15
.github/labeler.yml vendored
View File

@ -3,19 +3,18 @@ Kompute:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/include/ggml-kompute.h - ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute.cpp - ggml/src/ggml-kompute/**
- README-kompute.md - README-kompute.md
Apple Metal: Apple Metal:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/include/ggml-metal.h - ggml/include/ggml-metal.h
- ggml/src/ggml-metal.cpp - ggml/src/ggml-metal/**
- README-metal.md - README-metal.md
SYCL: SYCL:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/include/ggml-sycl.h - ggml/include/ggml-sycl.h
- ggml/src/ggml-sycl.cpp
- ggml/src/ggml-sycl/** - ggml/src/ggml-sycl/**
- docs/backend/SYCL.md - docs/backend/SYCL.md
- examples/sycl/** - examples/sycl/**
@ -27,8 +26,8 @@ Nvidia GPU:
Vulkan: Vulkan:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/ggml_vk_generate_shaders.py - ggml/include/ggml-vulkan.h
- ggml/src/ggml-vulkan* - ggml/src/ggml-vulkan/**
documentation: documentation:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
@ -75,11 +74,7 @@ server:
ggml: ggml:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/include/ggml*.h - ggml/**
- ggml/src/ggml*.c
- ggml/src/ggml*.cpp
- ggml/src/ggml*.h
- ggml-cuda/**
nix: nix:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:

View File

@ -728,7 +728,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc) cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake: windows-latest-cmake:
runs-on: windows-2019 runs-on: windows-latest
env: env:
OPENBLAS_VERSION: 0.3.23 OPENBLAS_VERSION: 0.3.23
@ -871,12 +871,33 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}.zip name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda: ubuntu-latest-cmake-cuda:
runs-on: ubuntu-latest
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Install dependencies
env:
DEBIAN_FRONTEND: noninteractive
run: |
apt update
apt install -y cmake build-essential ninja-build libgomp1 git
- name: Build with CMake
run: |
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
cmake --build build
windows-2019-cmake-cuda:
runs-on: windows-2019 runs-on: windows-2019
strategy: strategy:
matrix: matrix:
cuda: ['12.2.0', '11.7.1'] cuda: ['12.4', '11.7']
build: ['cuda'] build: ['cuda']
steps: steps:
@ -886,22 +907,81 @@ jobs:
with: with:
fetch-depth: 0 fetch-depth: 0
- name: Install CUDA toolkit - name: Install Cuda Toolkit 11.7
id: cuda-toolkit if: ${{ matrix.cuda == '11.7' }}
uses: Jimver/cuda-toolkit@v0.2.15 run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install Cuda Toolkit 12.4
if: ${{ matrix.cuda == '12.4' }}
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with: with:
cuda: ${{ matrix.cuda }} key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
method: 'network'
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' - name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Build - name: Build
id: cmake_build id: cmake_build
shell: cmd
run: | run: |
mkdir build call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cd build cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} cmake --build build --config Release
- name: Determine tag name - name: Determine tag name
id: tag id: tag
@ -930,10 +1010,12 @@ jobs:
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime - name: Copy and pack Cuda runtime
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
run: | run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" echo "Cuda install location: ${{ env.CUDA_PATH }}"
$dst='.\build\bin\cudart\' $dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime - name: Upload Cuda runtime
@ -984,7 +1066,7 @@ jobs:
- name: Build the release package - name: Build the release package
id: pack_artifacts id: pack_artifacts
if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
@ -1009,7 +1091,7 @@ jobs:
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package - name: Upload the release package
if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
@ -1059,6 +1141,8 @@ jobs:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install - name: Install
id: depends id: depends
@ -1173,7 +1257,7 @@ jobs:
- macOS-latest-make - macOS-latest-make
- macOS-latest-cmake - macOS-latest-cmake
- windows-latest-cmake - windows-latest-cmake
- windows-latest-cmake-cuda - windows-2019-cmake-cuda
- windows-latest-cmake-hip-release - windows-latest-cmake-hip-release
- macOS-latest-cmake-arm64 - macOS-latest-cmake-arm64
- macOS-latest-cmake-x64 - macOS-latest-cmake-x64

View File

@ -114,7 +114,7 @@ jobs:
swap-storage: true swap-storage: true
- name: Build and push Docker image (tagged + versioned) - name: Build and push Docker image (tagged + versioned)
if: github.event_name == 'push' if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
uses: docker/build-push-action@v6 uses: docker/build-push-action@v6
with: with:
context: . context: .

View File

@ -1,72 +0,0 @@
name: Nix aarch64 builds
on:
workflow_dispatch: # allows manual triggering
schedule:
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
# 1.5h instead of minutes with the cold cache).
#
# randint(0, 59), randint(0, 23)
- cron: '26 12 * * *'
# But also rebuild if we touched any of the Nix expressions:
push:
branches:
- master
paths: ['**/*.nix', 'flake.lock']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
id-token: write
contents: read
jobs:
nix-build-aarch64:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get update
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"

View File

@ -1,79 +0,0 @@
name: Nix CI
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
pull_request:
types: [opened, synchronize, reopened]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
id-token: write
contents: read
jobs:
nix-eval:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: List all flake outputs
run: nix flake show --all-systems
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--flake
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"

View File

@ -1,22 +0,0 @@
name: update-flake-lock
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
jobs:
lockfile:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock
uses: DeterminateSystems/update-flake-lock@main
with:
pr-title: "nix: update flake.lock"
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.FLAKE_TOKEN }}

View File

@ -1,36 +0,0 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
push:
tags:
- "*"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
flakestry-publish:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"
flakehub-publish:
runs-on: "ubuntu-latest"
permissions:
id-token: "write"
contents: "read"
steps:
- uses: "actions/checkout@v4"
with:
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
- uses: "DeterminateSystems/nix-installer-action@main"
- uses: "DeterminateSystems/flakehub-push@main"
with:
visibility: "public"
tag: "${{ inputs.tag }}"

View File

@ -1,6 +1,13 @@
name: flake8 Lint name: flake8 Lint
on: [push, pull_request] on:
push:
branches:
- master
paths: ['.github/workflows/python-lint.yml', '**/*.py']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/python-lint.yml', '**/*.py']
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

View File

@ -122,14 +122,14 @@ jobs:
id: server_integration_tests id: server_integration_tests
run: | run: |
cd examples/server/tests cd examples/server/tests
PORT=8888 ./tests.sh ./tests.sh
- name: Slow tests - name: Slow tests
id: server_integration_tests_slow id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: | run: |
cd examples/server/tests cd examples/server/tests
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow SLOW_TESTS=1 ./tests.sh
server-windows: server-windows:
@ -180,11 +180,12 @@ jobs:
run: | run: |
cd examples/server/tests cd examples/server/tests
$env:PYTHONIOENCODING = ":replace" $env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp pytest -v -x
- name: Slow tests - name: Slow tests
id: server_integration_tests_slow id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: | run: |
cd examples/server/tests cd examples/server/tests
behave.exe --stop --no-skipped --no-capture --tags slow $env:SLOW_TESTS = "1"
pytest -v -x

186
AUTHORS
View File

@ -1,4 +1,4 @@
# date: Wed Jun 26 19:36:34 EEST 2024 # date: Thu Nov 28 20:46:15 EET 2024
# this file is auto-generated by scripts/gen-authors.sh # this file is auto-generated by scripts/gen-authors.sh
0cc4m <picard12@live.de> 0cc4m <picard12@live.de>
@ -7,6 +7,7 @@
2f38b454 <dxf@protonmail.com> 2f38b454 <dxf@protonmail.com>
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
44670 <44670@users.noreply.github.com> 44670 <44670@users.noreply.github.com>
65a <10104049+65a@users.noreply.github.com>
AN Long <aisk@users.noreply.github.com> AN Long <aisk@users.noreply.github.com>
AT <manyoso@users.noreply.github.com> AT <manyoso@users.noreply.github.com>
Aarni Koskela <akx@iki.fi> Aarni Koskela <akx@iki.fi>
@ -19,20 +20,28 @@ Adithya Balaji <adithya.b94@gmail.com>
AdithyanI <adithyan.i4internet@gmail.com> AdithyanI <adithyan.i4internet@gmail.com>
Adrian <smith.adriane@gmail.com> Adrian <smith.adriane@gmail.com>
Adrian Hesketh <a-h@users.noreply.github.com> Adrian Hesketh <a-h@users.noreply.github.com>
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr> Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
AidanBeltonS <aidan.belton@codeplay.com>
Aisuko <urakiny@gmail.com> Aisuko <urakiny@gmail.com>
Akarshan Biswas <akarshan.biswas@gmail.com>
Akarshan Biswas <akarshanbiswas@fedoraproject.org> Akarshan Biswas <akarshanbiswas@fedoraproject.org>
Al Mochkin <14274697+amochkin@users.noreply.github.com>
Albert Jin <albert.jin@gmail.com> Albert Jin <albert.jin@gmail.com>
Alberto <57916483+albbus-stack@users.noreply.github.com> Alberto <57916483+albbus-stack@users.noreply.github.com>
Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
Alberto Cabrera Pérez <alberto.cabrera@intel.com>
Alex <awhill19@icloud.com> Alex <awhill19@icloud.com>
Alex Azarov <alex@azarov.by> Alex Azarov <alex@azarov.by>
Alex Azarov <alexander.azarov@mapbox.com> Alex Azarov <alexander.azarov@mapbox.com>
Alex Klinkhamer <from.github.com.917@grencez.dev> Alex Klinkhamer <from.github.com.917@grencez.dev>
Alex Klinkhamer <git@grencez.dev> Alex Klinkhamer <git@grencez.dev>
Alex Nguyen <tiendung@users.noreply.github.com> Alex Nguyen <tiendung@users.noreply.github.com>
Alex O'Connell <35843486+acon96@users.noreply.github.com>
Alex Petenchea <alex.petenchea@gmail.com> Alex Petenchea <alex.petenchea@gmail.com>
Alex Renda <alexrenda@users.noreply.github.com> Alex Renda <alexrenda@users.noreply.github.com>
Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
Alex von Gluck IV <kallisti5@unixzen.com> Alex von Gluck IV <kallisti5@unixzen.com>
Alexey Parfenov <zxed@alkatrazstudio.net> Alexey Parfenov <zxed@alkatrazstudio.net>
Ali Chraghi <63465728+alichraghi@users.noreply.github.com> Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
Ananta Bastola <anantarajbastola@gmail.com> Ananta Bastola <anantarajbastola@gmail.com>
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
András Salamon <ott2@users.noreply.github.com> András Salamon <ott2@users.noreply.github.com>
Andreas (Andi) Kunar <andreask@msn.com>
Andrei <abetlen@gmail.com> Andrei <abetlen@gmail.com>
Andrew Canis <andrew.canis@gmail.com> Andrew Canis <andrew.canis@gmail.com>
Andrew Downing <andrew2085@gmail.com> Andrew Downing <andrew2085@gmail.com>
Andrew Duffy <a10y@users.noreply.github.com> Andrew Duffy <a10y@users.noreply.github.com>
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com> Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
Andy Salerno <andysalerno@gmail.com>
Andy Tai <andy-tai@users.noreply.github.com> Andy Tai <andy-tai@users.noreply.github.com>
Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
Antonis Makropoulos <benuix@gmail.com>
Arik Poznanski <arikpoz@users.noreply.github.com> Arik Poznanski <arikpoz@users.noreply.github.com>
Armen Kaleshian <kriation@users.noreply.github.com>
Artem <guinmoon@gmail.com> Artem <guinmoon@gmail.com>
Artem Zinnatullin <ceo@abstractny.gay> Artem Zinnatullin <ceo@abstractny.gay>
Artyom Lebedev <vagran.ast@gmail.com> Artyom Lebedev <vagran.ast@gmail.com>
Asbjørn Olling <asbjornolling@gmail.com> Asbjørn Olling <asbjornolling@gmail.com>
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org> Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
Ashish <1856117+ashishdatta@users.noreply.github.com> Ashish <1856117+ashishdatta@users.noreply.github.com>
Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
Ashraful Islam <ashraful.meche@gmail.com> Ashraful Islam <ashraful.meche@gmail.com>
@ -76,12 +92,16 @@ Ben Williams <ben@719ben.com>
Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
Bernat Vadell <hounter.caza@gmail.com> Bernat Vadell <hounter.caza@gmail.com>
Bert Wagner <github@bertwagner.com>
Bingan <70050083+binganao@users.noreply.github.com> Bingan <70050083+binganao@users.noreply.github.com>
Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
Bodo Graumann <mail@bodograumann.de> Bodo Graumann <mail@bodograumann.de>
Bono Lv <lvscar@users.noreply.github.com> Bono Lv <lvscar@users.noreply.github.com>
Borislav Stanimirov <b.stanimirov@abv.bg> Borislav Stanimirov <b.stanimirov@abv.bg>
Branden Butler <bwtbutler@hotmail.com> Branden Butler <bwtbutler@hotmail.com>
Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
Brian <mofosyne@gmail.com> Brian <mofosyne@gmail.com>
Brian Cunnie <brian.cunnie@gmail.com>
Bruce MacDonald <brucewmacdonald@gmail.com> Bruce MacDonald <brucewmacdonald@gmail.com>
Bryan Honof <bryanhonof@gmail.com> Bryan Honof <bryanhonof@gmail.com>
CJ Pais <cj@cjpais.com> CJ Pais <cj@cjpais.com>
@ -90,32 +110,47 @@ Calvin Laurenson <calvin@laurenson.dev>
Cameron <csteele@steelecameron.com> Cameron <csteele@steelecameron.com>
Cameron Kaiser <classilla@users.noreply.github.com> Cameron Kaiser <classilla@users.noreply.github.com>
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com> Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
CarryFun <76023481+CarryFun@users.noreply.github.com>
Carsten Kragelund Jørgensen <carsten@kragelund.me>
CarterLi999 <664681047@qq.com>
Casey Primozic <casey@cprimozic.net> Casey Primozic <casey@cprimozic.net>
Casey Primozic <me@ameo.link> Casey Primozic <me@ameo.link>
CausalLM <148736309+CausalLM@users.noreply.github.com> CausalLM <148736309+CausalLM@users.noreply.github.com>
Cebtenzzre <cebtenzzre@gmail.com> Cebtenzzre <cebtenzzre@gmail.com>
Chad Brewbaker <crb002@gmail.com> Chad Brewbaker <crb002@gmail.com>
Changyeon Kim <cyzero.kim@samsung.com>
Chao Jiang <jc19chaoj@zoho.com> Chao Jiang <jc19chaoj@zoho.com>
Charles Xu <63788048+chaxu01@users.noreply.github.com>
Charles Xu <charles.xu@arm.com>
Chen Xi <xi2.chen@intel.com>
Chen Xi <xixichen08@foxmail.com>
Cheng Shao <terrorjack@type.dance> Cheng Shao <terrorjack@type.dance>
Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Chris Elrod <elrodc@gmail.com> Chris Elrod <elrodc@gmail.com>
Chris Kuehl <ckuehl@ckuehl.me> Chris Kuehl <ckuehl@ckuehl.me>
Christian Demsar <christian@github.email.demsar.us> Christian Demsar <christian@github.email.demsar.us>
Christian Demsar <crasm@git.vczf.us> Christian Demsar <crasm@git.vczf.us>
Christian Falch <875252+chrfalch@users.noreply.github.com> Christian Falch <875252+chrfalch@users.noreply.github.com>
Christian Kögler <ck3d@gmx.de> Christian Kögler <ck3d@gmx.de>
Christian Köhnenkamp <cvk5@me.com>
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
Clark Saben <76020733+csaben@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com>
Clint Herron <hanclinto@gmail.com> Clint Herron <hanclinto@gmail.com>
Conrad Kramer <conrad@conradkramer.com>
CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
Cuong Trinh Manh <nguoithichkhampha@gmail.com> Cuong Trinh Manh <nguoithichkhampha@gmail.com>
DAN™ <dranger003@gmail.com> DAN™ <dranger003@gmail.com>
Damian Stewart <d@damianstewart.com> Damian Stewart <d@damianstewart.com>
Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
Dan Johansson <dan.johansson@arm.com>
Dane Madsen <dane_madsen@hotmail.com> Dane Madsen <dane_madsen@hotmail.com>
DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com> DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
Daniel Bevenius <daniel.bevenius@gmail.com> Daniel Bevenius <daniel.bevenius@gmail.com>
Daniel Drake <drake@endlessos.org> Daniel Drake <drake@endlessos.org>
Daniel Hiltgen <dhiltgen@users.noreply.github.com> Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Daniel Illescas Romero <illescas.daniel@protonmail.com> Daniel Illescas Romero <illescas.daniel@protonmail.com>
Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Daniele <57776841+daniandtheweb@users.noreply.github.com> Daniele <57776841+daniandtheweb@users.noreply.github.com>
DannyDaemonic <DannyDaemonic@gmail.com> DannyDaemonic <DannyDaemonic@gmail.com>
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
@ -129,19 +164,28 @@ David Pflug <david@pflug.email>
David Renshaw <dwrenshaw@gmail.com> David Renshaw <dwrenshaw@gmail.com>
David Sommers <12738+databyte@users.noreply.github.com> David Sommers <12738+databyte@users.noreply.github.com>
David Yang <davidyang6us@gmail.com> David Yang <davidyang6us@gmail.com>
DavidKorczynski <david@adalogics.com>
Dawid Potocki <github@dawidpotocki.com> Dawid Potocki <github@dawidpotocki.com>
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
Dean <Dean.Sinaean@gmail.com> Dean <Dean.Sinaean@gmail.com>
Deins <deinsegle@gmail.com> Deins <deinsegle@gmail.com>
Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
Derrick T. Woolworth <dwoolworth@gmail.com>
Deven Mistry <31466137+deven367@users.noreply.github.com> Deven Mistry <31466137+deven367@users.noreply.github.com>
Dibakar Gope <dibakar.gope@arm.com>
Didzis Gosko <didzis@users.noreply.github.com> Didzis Gosko <didzis@users.noreply.github.com>
Diego Devesa <slarengh@gmail.com>
Diogo Teles Sant'Anna <diogoteles@google.com>
Djip007 <djip.perois@free.fr> Djip007 <djip.perois@free.fr>
Don Mahurin <dmahurin@users.noreply.github.com> Don Mahurin <dmahurin@users.noreply.github.com>
DooWoong Lee (David) <manics99@naver.com> DooWoong Lee (David) <manics99@naver.com>
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
Dou Xinpeng <15529241576@163.com>
Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
Douglas Hanley <thesecretaryofwar@gmail.com> Douglas Hanley <thesecretaryofwar@gmail.com>
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
Ebey Abraham <ebey97@gmail.com> Ebey Abraham <ebey97@gmail.com>
Echo Nolan <echo@echonolan.net>
Ed Lee <edilee@mozilla.com> Ed Lee <edilee@mozilla.com>
Ed Lepedus <ed.lepedus@googlemail.com> Ed Lepedus <ed.lepedus@googlemail.com>
Eddie-Wang <wangjinheng1120@163.com> Eddie-Wang <wangjinheng1120@163.com>
@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com>
Elton Kola <eltonkola@gmail.com> Elton Kola <eltonkola@gmail.com>
Engininja2 <139037756+Engininja2@users.noreply.github.com> Engininja2 <139037756+Engininja2@users.noreply.github.com>
Equim <sayaka@ekyu.moe> Equim <sayaka@ekyu.moe>
Eric Curtin <ecurtin@redhat.com>
Eric Curtin <ericcurtin17@gmail.com>
Eric Sommerlade <es0m@users.noreply.github.com> Eric Sommerlade <es0m@users.noreply.github.com>
Eric Zhang <34133756+EZForever@users.noreply.github.com> Eric Zhang <34133756+EZForever@users.noreply.github.com>
Erik Garrison <erik.garrison@gmail.com> Erik Garrison <erik.garrison@gmail.com>
Erik Scholz <Green-Sky@users.noreply.github.com> Erik Scholz <Green-Sky@users.noreply.github.com>
Esko Toivonen <eskot98@gmail.com>
Ettore Di Giacinto <mudler@users.noreply.github.com> Ettore Di Giacinto <mudler@users.noreply.github.com>
Evan Jones <evan.q.jones@gmail.com> Evan Jones <evan.q.jones@gmail.com>
Evan Miller <emmiller@gmail.com> Evan Miller <emmiller@gmail.com>
@ -166,19 +213,26 @@ FK <sozforex@gmail.com>
Fabian <cmdrf@users.noreply.github.com> Fabian <cmdrf@users.noreply.github.com>
Fabio R. Sluzala <Fabio3rs@users.noreply.github.com> Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
Faez Shakil <faez.shakil@gmail.com> Faez Shakil <faez.shakil@gmail.com>
Faisal Zaghloul <faisal.zaghloul@gmail.com>
Faisal Zaghloul <quic_fzaghlou@quicinc.com>
Fan Shupei <dymarkfan@outlook.com>
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com> FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
Fattire <528174+fat-tire@users.noreply.github.com> Fattire <528174+fat-tire@users.noreply.github.com>
Felix <stenbackfelix@gmail.com> Felix <stenbackfelix@gmail.com>
Finn Voorhees <finnvoorhees@gmail.com> Finn Voorhees <finnvoorhees@gmail.com>
Firat <firatkiral@gmail.com> Firat <firatkiral@gmail.com>
FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
Francisco Melo <43780565+francis2tm@users.noreply.github.com> Francisco Melo <43780565+francis2tm@users.noreply.github.com>
Frank Mai <thxcode0824@gmail.com> Frank Mai <thxcode0824@gmail.com>
FrankHB <frankhb1989@gmail.com> FrankHB <frankhb1989@gmail.com>
Frankie Robertson <frankier@users.noreply.github.com>
Fred Douglas <43351173+fredlas@users.noreply.github.com> Fred Douglas <43351173+fredlas@users.noreply.github.com>
Frederik Vogel <Schaltfehler@users.noreply.github.com> Frederik Vogel <Schaltfehler@users.noreply.github.com>
Gabe Goodhart <gabe.l.hart@gmail.com> Gabe Goodhart <gabe.l.hart@gmail.com>
Gabe Goodhart <ghart@us.ibm.com>
GainLee <perfecter.gen@gmail.com> GainLee <perfecter.gen@gmail.com>
Galunid <karolek1231456@gmail.com> Galunid <karolek1231456@gmail.com>
Gary Linscott <glinscott@gmail.com> Gary Linscott <glinscott@gmail.com>
@ -187,11 +241,13 @@ Gavin Zhao <gavinzhaojw@protonmail.com>
Genkagaku.GPT <hlhr202@163.com> Genkagaku.GPT <hlhr202@163.com>
Georgi Gerganov <ggerganov@gmail.com> Georgi Gerganov <ggerganov@gmail.com>
Gilad S <giladgd@users.noreply.github.com> Gilad S <giladgd@users.noreply.github.com>
Gilad S. <7817232+giladgd@users.noreply.github.com>
Giuseppe Scrivano <giuseppe@scrivano.org> Giuseppe Scrivano <giuseppe@scrivano.org>
GiviMAD <GiviMAD@users.noreply.github.com> GiviMAD <GiviMAD@users.noreply.github.com>
Govlzkoy <gotope@users.noreply.github.com> Govlzkoy <gotope@users.noreply.github.com>
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com> Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
Guillaume Wenzek <gwenzek@users.noreply.github.com> Guillaume Wenzek <gwenzek@users.noreply.github.com>
Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
Haggai Nuchi <h.nuchi@gmail.com> Haggai Nuchi <h.nuchi@gmail.com>
@ -213,11 +269,14 @@ Hong Bo PENG <penghb@cn.ibm.com>
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
Howard Su <howard0su@gmail.com> Howard Su <howard0su@gmail.com>
Hua Jiang <allenhjiang@outlook.com> Hua Jiang <allenhjiang@outlook.com>
Huang Qi <huangqi3@xiaomi.com>
Huawei Lin <huaweilin.cs@gmail.com> Huawei Lin <huaweilin.cs@gmail.com>
Hugo Roussel <hugo.rous@gmail.com> Hugo Roussel <hugo.rous@gmail.com>
Huifeng Ou <79071290+ho2103@users.noreply.github.com>
Ian Bull <irbull@eclipsesource.com> Ian Bull <irbull@eclipsesource.com>
Ian Bull <irbull@gmail.com> Ian Bull <irbull@gmail.com>
Ian Scrivener <github@zilogy.asia> Ian Scrivener <github@zilogy.asia>
Icecream95 <the.real.icecream95@gmail.com>
Ido S <ido.pluto@gmail.com> Ido S <ido.pluto@gmail.com>
IgnacioFDM <ignaciofdm@gmail.com> IgnacioFDM <ignaciofdm@gmail.com>
Igor Okulist <okigan@gmail.com> Igor Okulist <okigan@gmail.com>
@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Ionoclast Laboratories <brigham@ionoclast.com> Ionoclast Laboratories <brigham@ionoclast.com>
Isaac McFadyen <isaac@imcf.me> Isaac McFadyen <isaac@imcf.me>
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
Ivan <nekotekina@gmail.com>
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
Ivan Komarov <Ivan.Komarov@dfyz.info> Ivan Komarov <Ivan.Komarov@dfyz.info>
Ivan Stepanov <ivanstepanovftw@gmail.com> Ivan Stepanov <ivanstepanovftw@gmail.com>
JH23X <165871467+JH23X@users.noreply.github.com> JH23X <165871467+JH23X@users.noreply.github.com>
Jack Mousseau <jack@software.inc>
Jack Mousseau <jmousseau@users.noreply.github.com> Jack Mousseau <jmousseau@users.noreply.github.com>
JackJollimore <130917767+JackJollimore@users.noreply.github.com> JackJollimore <130917767+JackJollimore@users.noreply.github.com>
Jaeden Amero <jaeden@patater.com>
Jaemin Son <woalsdnd@gmail.com> Jaemin Son <woalsdnd@gmail.com>
Jag Chadha <jagtesh@gmail.com> Jag Chadha <jagtesh@gmail.com>
Jakub N <jakubniemczyk97@gmail.com> Jakub N <jakubniemczyk97@gmail.com>
@ -243,10 +306,14 @@ Jannis Schönleber <joennlae@gmail.com>
Jared Van Bortel <cebtenzzre@gmail.com> Jared Van Bortel <cebtenzzre@gmail.com>
Jared Van Bortel <jared@nomic.ai> Jared Van Bortel <jared@nomic.ai>
Jason McCartney <jmac@theroot.org> Jason McCartney <jmac@theroot.org>
Jason Stillerman <jason.t.stillerman@gmail.com>
Jean-Christophe Hoelt <hoelt@fovea.cc> Jean-Christophe Hoelt <hoelt@fovea.cc>
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com> Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
Jed Fox <git@jedfox.com> Jed Fox <git@jedfox.com>
Jeff Bolz <jbolz@nvidia.com>
Jeffrey Morgan <jmorganca@gmail.com>
Jeffrey Quesnelle <emozilla@nousresearch.com> Jeffrey Quesnelle <emozilla@nousresearch.com>
Jeroen Mostert <jeroen.mostert@cm.com>
Jesse Jojo Johnson <williamsaintgeorge@gmail.com> Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
Jeximo <jeximo@gmail.com> Jeximo <jeximo@gmail.com>
Jhen-Jie Hong <iainst0409@gmail.com> Jhen-Jie Hong <iainst0409@gmail.com>
@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
Jiří Sejkora <Sejseloid@gmail.com> Jiří Sejkora <Sejseloid@gmail.com>
Joan Fontanals <jfontanalsmartinez@gmail.com> Joan Fontanals <jfontanalsmartinez@gmail.com>
Joan Fontanals <joan.fontanals.martinez@jina.ai> Joan Fontanals <joan.fontanals.martinez@jina.ai>
João Dinis Ferreira <hello@joaof.eu>
Joe Eli McIlvain <joe.eli.mac@gmail.com>
Joe Todd <joe.todd@codeplay.com>
Johan <JohanAR@users.noreply.github.com> Johan <JohanAR@users.noreply.github.com>
Johannes Gäßler <johannesg@5d6.de> Johannes Gäßler <johannesg@5d6.de>
Johannes Rudolph <johannes.rudolph@gmail.com> Johannes Rudolph <johannes.rudolph@gmail.com>
@ -274,7 +344,9 @@ Joyce <joycebrum@google.com>
Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
Judd <foldl@users.noreply.github.com> Judd <foldl@users.noreply.github.com>
Julius Arkenberg <arki05@users.noreply.github.com> Julius Arkenberg <arki05@users.noreply.github.com>
Jun Hee Yoo <contact.jhyoo@gmail.com>
Jun Jie <71215065+junnjiee16@users.noreply.github.com> Jun Jie <71215065+junnjiee16@users.noreply.github.com>
Junil Kim <logyourself@gmail.com>
Junyang Lin <justinlin930319@hotmail.com> Junyang Lin <justinlin930319@hotmail.com>
Juraj Bednar <juraj@bednar.io> Juraj Bednar <juraj@bednar.io>
Justin Parker <jparkerweb@gmail.com> Justin Parker <jparkerweb@gmail.com>
@ -292,12 +364,14 @@ Karthik Sethuraman <k.seth1993@gmail.com>
Kasumi <90275229+kasumi-1@users.noreply.github.com> Kasumi <90275229+kasumi-1@users.noreply.github.com>
Kawrakow <48489457+ikawrakow@users.noreply.github.com> Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Keiichi Tabata <keiichi.tabata@outlook.com> Keiichi Tabata <keiichi.tabata@outlook.com>
Keke Han <hankeke303@163.com>
Kenvix ⭐ <kenvixzure@live.com> Kenvix ⭐ <kenvixzure@live.com>
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Kevin Gibbons <bakkot@gmail.com> Kevin Gibbons <bakkot@gmail.com>
Kevin Ji <1146876+kevinji@users.noreply.github.com> Kevin Ji <1146876+kevinji@users.noreply.github.com>
Kevin Kwok <antimatter15@gmail.com> Kevin Kwok <antimatter15@gmail.com>
Kevin Lo <kevlo@kevlo.org> Kevin Lo <kevlo@kevlo.org>
Kevin Wang <kevmo314@gmail.com>
Kolen Cheung <ickc@users.noreply.github.com> Kolen Cheung <ickc@users.noreply.github.com>
Konstantin Herud <konstantin.herud@denkbares.com> Konstantin Herud <konstantin.herud@denkbares.com>
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com> Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
Leonardo Neumann <leonardo@neumann.dev.br> Leonardo Neumann <leonardo@neumann.dev.br>
Li Tan <tanliboy@gmail.com> Li Tan <tanliboy@gmail.com>
Linwei Wang <wanix1988@gmail.com> Linwei Wang <wanix1988@gmail.com>
Liu Jia <109258120+Septa2112@users.noreply.github.com>
Liu Jia <jia3.liu@intel.com>
LoganDark <github@logandark.mozmail.com> LoganDark <github@logandark.mozmail.com>
Loïc Carrère <loic.carrere@gmail.com>
LostRuins <39025047+LostRuins@users.noreply.github.com> LostRuins <39025047+LostRuins@users.noreply.github.com>
Luciano <lucianostrika44@gmail.com> Luciano <lucianostrika44@gmail.com>
Luo Tian <lt@basecity.com> Luo Tian <lt@basecity.com>
Lyle Dean <dean@lyle.dev> Lyle Dean <dean@lyle.dev>
M-A <maruel@gmail.com>
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com> M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Ma Mingfei <mingfei.ma@intel.com>
Maarten ter Huurne <maarten@treewalker.org> Maarten ter Huurne <maarten@treewalker.org>
Mack Straight <eiz@users.noreply.github.com> Mack Straight <eiz@users.noreply.github.com>
Maël Kerbiriou <m431.kerbiriou@gmail.com> Maël Kerbiriou <m431.kerbiriou@gmail.com>
MaggotHATE <clay1326@gmail.com> MaggotHATE <clay1326@gmail.com>
Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
Manuel <44313466+makuche@users.noreply.github.com> Manuel <44313466+makuche@users.noreply.github.com>
Marc Köhlbrugge <subscriptions@marckohlbrugge.com> Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
Marco Matthies <71844+marcom@users.noreply.github.com> Marco Matthies <71844+marcom@users.noreply.github.com>
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
Marian Cepok <marian.cepok@gmail.com> Marian Cepok <marian.cepok@gmail.com>
Mark Fairbairn <thebaron88@gmail.com> Mark Fairbairn <thebaron88@gmail.com>
Mark Zhuang <zhuangqiubin@gmail.com>
Marko Tasic <mtasic85@gmail.com> Marko Tasic <mtasic85@gmail.com>
Markus Tavenrath <mtavenrath@users.noreply.github.com> Markus Tavenrath <mtavenrath@users.noreply.github.com>
Martin Delille <martin@delille.org> Martin Delille <martin@delille.org>
@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com> Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
Matheus C. França <matheus-catarino@hotmail.com> Matheus C. França <matheus-catarino@hotmail.com>
Matheus Gabriel Alves Silva <matheusgasource@gmail.com> Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
Mathieu Geli <mathieu.geli@gmail.com>
Mathieu Nayrolles <MathieuNls@users.noreply.github.com> Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
Mathijs Henquet <mathijs.henquet@gmail.com>
Mathijs de Bruin <mathijs@mathijsfietst.nl> Mathijs de Bruin <mathijs@mathijsfietst.nl>
Matt Clayton <156335168+mattjcly@users.noreply.github.com> Matt Clayton <156335168+mattjcly@users.noreply.github.com>
Matt Pulver <matt.pulver@heavy.ai> Matt Pulver <matt.pulver@heavy.ai>
Matt Stephenson <mstephenson6@users.noreply.github.com>
Matteo Boschini <12133566+mbosc@users.noreply.github.com> Matteo Boschini <12133566+mbosc@users.noreply.github.com>
Matteo Mortari <matteo.mortari@gmail.com>
Mattheus Chediak <shammcity00@gmail.com> Mattheus Chediak <shammcity00@gmail.com>
Matthew Tejo <matthew.tejo@gmail.com> Matthew Tejo <matthew.tejo@gmail.com>
Matvey Soloviev <blackhole89@gmail.com> Matvey Soloviev <blackhole89@gmail.com>
@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com>
Maximilian Winter <maximilian.winter.91@gmail.com> Maximilian Winter <maximilian.winter.91@gmail.com>
Meng Zhang <meng@tabbyml.com> Meng Zhang <meng@tabbyml.com>
Meng, Hengyu <hengyu.meng@intel.com> Meng, Hengyu <hengyu.meng@intel.com>
Mengqing Cao <cmq0113@163.com>
Merrick Christensen <merrick.christensen@gmail.com> Merrick Christensen <merrick.christensen@gmail.com>
Michael Coppola <m18coppola@gmail.com> Michael Coppola <m18coppola@gmail.com>
Michael Francis <edude03@gmail.com>
Michael Hueschen <m@mhueschen.dev> Michael Hueschen <m@mhueschen.dev>
Michael Kesper <mkesper@schokokeks.org> Michael Kesper <mkesper@schokokeks.org>
Michael Klimenko <mklimenko29@gmail.com> Michael Klimenko <mklimenko29@gmail.com>
@ -365,41 +452,57 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
Michael Potter <NanoTekGuy@Gmail.com> Michael Potter <NanoTekGuy@Gmail.com>
Michael de Gans <michael.john.degans@gmail.com> Michael de Gans <michael.john.degans@gmail.com>
Michaël de Vries <vriesdemichael@gmail.com> Michaël de Vries <vriesdemichael@gmail.com>
Michał Tuszyński <srgtuszy@gmail.com>
Mihai <mihai.chirculescu@yahoo.com> Mihai <mihai.chirculescu@yahoo.com>
Mike <ytianhui2004@gmail.com> Mike <ytianhui2004@gmail.com>
Mikko Juola <mikjuo@gmail.com> Mikko Juola <mikjuo@gmail.com>
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
Minsoo Cheong <icycle0409@snu.ac.kr>
Mirko185 <mirkosig@gmail.com> Mirko185 <mirkosig@gmail.com>
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
MistApproach <98988043+MistApproach@users.noreply.github.com>
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com> Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com> Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
Molly Sophia <mollysophia379@gmail.com>
MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
Murilo Santana <mvrilo@gmail.com> Murilo Santana <mvrilo@gmail.com>
Musab Gultekin <musabgultekin@users.noreply.github.com> Musab Gultekin <musabgultekin@users.noreply.github.com>
Nam D. Tran <42194884+namtranase@users.noreply.github.com> Nam D. Tran <42194884+namtranase@users.noreply.github.com>
Nathan Epstein <nate2@umbc.edu> Nathan Epstein <nate2@umbc.edu>
Natsu <chino@hotococoa.moe>
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
Nebula <infinitewormhole@gmail.com> Nebula <infinitewormhole@gmail.com>
Neo Zhang <14088817+arthw@users.noreply.github.com> Neo Zhang <14088817+arthw@users.noreply.github.com>
Neo Zhang <zhang.jianyu@outlook.com> Neo Zhang <zhang.jianyu@outlook.com>
Neo Zhang Jianyu <jianyu.zhang@intel.com> Neo Zhang Jianyu <jianyu.zhang@intel.com>
Neuman Vong <neuman.vong@gmail.com> Neuman Vong <neuman.vong@gmail.com>
Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
Nexesenex <124105151+Nexesenex@users.noreply.github.com> Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Niall Coates <1349685+Niall-@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com>
Nicholai Tukanov <nicholaitukanov@gmail.com>
Nico Bosshard <nico@bosshome.ch>
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de> Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
Nicolás Pérez <nicolas_perez@brown.edu> Nicolás Pérez <nicolas_perez@brown.edu>
Nigel Bosch <pnigelb@gmail.com> Nigel Bosch <pnigelb@gmail.com>
Niklas Korz <niklas@niklaskorz.de> Niklas Korz <niklas@niklaskorz.de>
NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
Nikolas <127742645+nneubacher@users.noreply.github.com> Nikolas <127742645+nneubacher@users.noreply.github.com>
Nindaleth <Nindaleth@users.noreply.github.com> Nindaleth <Nindaleth@users.noreply.github.com>
OSecret <135510162+OLSecret@users.noreply.github.com>
Oleksandr Nikitin <oleksandr@tvori.info> Oleksandr Nikitin <oleksandr@tvori.info>
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com> Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
Olivier Chafik <ochafik@users.noreply.github.com> Olivier Chafik <ochafik@users.noreply.github.com>
Ondřej Čertík <ondrej@certik.us> Ondřej Čertík <ondrej@certik.us>
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com> Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
PAB <pierreantoine.bannier@gmail.com>
Pablo Duboue <pablo.duboue@gmail.com>
Pascal Patry <ppatry@mtacitlabs.com>
Patrice Ferlet <metal3d@gmail.com> Patrice Ferlet <metal3d@gmail.com>
Paul Tsochantaris <ptsochantaris@icloud.com> Paul Tsochantaris <ptsochantaris@icloud.com>
Pavel Zloi <github.com@drteam.rocks>
Pavol Rusnak <pavol@rusnak.io> Pavol Rusnak <pavol@rusnak.io>
Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
Pedro Cuenca <pedro@huggingface.co> Pedro Cuenca <pedro@huggingface.co>
Peter Sugihara <peter@campsh.com> Peter Sugihara <peter@campsh.com>
Phil H <5756783+phiharri@users.noreply.github.com> Phil H <5756783+phiharri@users.noreply.github.com>
@ -407,10 +510,15 @@ Philip Taron <philip.taron@gmail.com>
Phillip Kravtsov <phillip@kravtsov.net> Phillip Kravtsov <phillip@kravtsov.net>
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com> Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
Pierrick Hymbert <pierrick.hymbert@gmail.com> Pierrick Hymbert <pierrick.hymbert@gmail.com>
Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
Plamen Minev <pacominev@gmail.com>
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
Przemysław Pawełczyk <przemoc@gmail.com> Przemysław Pawełczyk <przemoc@gmail.com>
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
Qingyou Meng <meng.qingyou@gmail.com> Qingyou Meng <meng.qingyou@gmail.com>
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com> Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
R0CKSTAR <xiaodong.ye@mthreads.com>
R0CKSTAR <yeahdongcn@gmail.com>
RJ Adriaansen <adriaansen@eshcc.eur.nl> RJ Adriaansen <adriaansen@eshcc.eur.nl>
Radoslav Gerganov <rgerganov@gmail.com> Radoslav Gerganov <rgerganov@gmail.com>
Radosław Gryta <radek.gryta@gmail.com> Radosław Gryta <radek.gryta@gmail.com>
@ -419,11 +527,13 @@ Raj Hammeer Singh Hada <hammeerraj@gmail.com>
Ralph Soika <ralph.soika@imixs.com> Ralph Soika <ralph.soika@imixs.com>
Rand Xie <randxiexyy29@gmail.com> Rand Xie <randxiexyy29@gmail.com>
Randall Fitzgerald <randall@dasaku.net> Randall Fitzgerald <randall@dasaku.net>
Random Fly <renfei8@live.cn>
Reinforce-II <fate@eastal.com> Reinforce-II <fate@eastal.com>
Ren Xuancheng <jklj077@users.noreply.github.com> Ren Xuancheng <jklj077@users.noreply.github.com>
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
RhinoDevel <RhinoDevel@users.noreply.github.com> RhinoDevel <RhinoDevel@users.noreply.github.com>
Riceball LEE <snowyu.lee@gmail.com> Riceball LEE <snowyu.lee@gmail.com>
Rich Dougherty <rich@rd.nz>
Richard Kiss <him@richardkiss.com> Richard Kiss <him@richardkiss.com>
Richard Roberson <richardr1126@gmail.com> Richard Roberson <richardr1126@gmail.com>
Rick G <26732651+TheFlipbook@users.noreply.github.com> Rick G <26732651+TheFlipbook@users.noreply.github.com>
@ -439,21 +549,30 @@ Robey Holderith <robey@flaminglunchbox.net>
Robyn <robyngraf@users.noreply.github.com> Robyn <robyngraf@users.noreply.github.com>
Roger Meier <r.meier@siemens.com> Roger Meier <r.meier@siemens.com>
Roland <14355895+rbur0425@users.noreply.github.com> Roland <14355895+rbur0425@users.noreply.github.com>
Romain Biessy <romain.biessy@codeplay.com>
Romain D <90720+Artefact2@users.noreply.github.com> Romain D <90720+Artefact2@users.noreply.github.com>
Romain Neutron <romain@neutron.io> Romain Neutron <romain@neutron.io>
Roman Parykin <donderom@gmail.com> Roman Parykin <donderom@gmail.com>
Ron Evans <ron@hybridgroup.com> Ron Evans <ron@hybridgroup.com>
Ron Jailall <rojailal@gmail.com> Ron Jailall <rojailal@gmail.com>
Roni <sulpher@gmx.net>
Ronny Brendel <ronnybrendel@gmail.com> Ronny Brendel <ronnybrendel@gmail.com>
Ronsor <ronsor@ronsor.pw> Ronsor <ronsor@ronsor.pw>
Rowan Hart <rowanbhart@gmail.com> Rowan Hart <rowanbhart@gmail.com>
Ruchira Hasaranga <ruchira66@gmail.com>
Ruixin Huang <18860020911@163.com>
Rune <43761327+Rune-AI@users.noreply.github.com> Rune <43761327+Rune-AI@users.noreply.github.com>
RunningLeon <maningsheng@sensetime.com>
RunningLeon <mnsheng@yeah.net>
Ryan Landay <rlanday@gmail.com> Ryan Landay <rlanday@gmail.com>
Ryder Wishart <ryderwishart@gmail.com> Ryder Wishart <ryderwishart@gmail.com>
Ryuei <louixs@users.noreply.github.com> Ryuei <louixs@users.noreply.github.com>
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
SXX <sxx1136965276@gmail.com>
SakuraUmi <yukinon244@gmail.com> SakuraUmi <yukinon244@gmail.com>
Salvador E. Tropea <stropea@inti.gob.ar> Salvador E. Tropea <stropea@inti.gob.ar>
Salvatore Mesoraca <s.mesoraca16@gmail.com>
Sam Spilsbury <smspillaz@gmail.com> Sam Spilsbury <smspillaz@gmail.com>
Sami Farin <3876865+Safari77@users.noreply.github.com> Sami Farin <3876865+Safari77@users.noreply.github.com>
Samuel Maynard <samwmaynard@gmail.com> Samuel Maynard <samwmaynard@gmail.com>
@ -463,23 +582,29 @@ Sebastián A <sebastian.aedo29@gmail.com>
SebastianApel <13675545+SebastianApel@users.noreply.github.com> SebastianApel <13675545+SebastianApel@users.noreply.github.com>
Senemu <10880819+Senemu@users.noreply.github.com> Senemu <10880819+Senemu@users.noreply.github.com>
Sergey Alirzaev <zl29ah@gmail.com> Sergey Alirzaev <zl29ah@gmail.com>
Sergio López <slp@redhat.com>
Sergio López <slp@sinrega.org> Sergio López <slp@sinrega.org>
Sertaç Özercan <852750+sozercan@users.noreply.github.com> Sertaç Özercan <852750+sozercan@users.noreply.github.com>
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
ShadovvBeast <ShadovvBeast@gmail.com> ShadovvBeast <ShadovvBeast@gmail.com>
Shakhar Dasgupta <shakhardasgupta@gmail.com> Shakhar Dasgupta <shakhardasgupta@gmail.com>
Shane A <shanea@allenai.org>
Shangning Xu <32517059+xushangning@users.noreply.github.com> Shangning Xu <32517059+xushangning@users.noreply.github.com>
Shankar <gshankar.87@gmail.com>
Shanshan Shen <467638484@qq.com>
Shijie <821898965@qq.com> Shijie <821898965@qq.com>
Shintarou Okada <kokuzen@gmail.com> Shintarou Okada <kokuzen@gmail.com>
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
Shouzheng Liu <lshzh.hi@gmail.com> Shouzheng Liu <lshzh.hi@gmail.com>
Shuichi Tsutsumi <shuichi0526@gmail.com> Shuichi Tsutsumi <shuichi0526@gmail.com>
Shupei Fan <dymarkfan@outlook.com>
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Simon Willison <swillison@gmail.com> Simon Willison <swillison@gmail.com>
Siwen Yu <yusiwen@gmail.com> Siwen Yu <yusiwen@gmail.com>
Sky Yan <skyan83@gmail.com> Sky Yan <skyan83@gmail.com>
Slaren <2141330+slaren@users.noreply.github.com> Slaren <2141330+slaren@users.noreply.github.com>
Slava Primenko <primenko.s@gmail.com> Slava Primenko <primenko.s@gmail.com>
Small Grass Forest <zixuanxcl@gmail.com>
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com> SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
Someone <sergei.kozlukov@aalto.fi> Someone <sergei.kozlukov@aalto.fi>
Someone Serge <sergei.kozlukov@aalto.fi> Someone Serge <sergei.kozlukov@aalto.fi>
@ -491,12 +616,15 @@ Stefan Sydow <stefan@sydow.email>
Steffen Röcker <sroecker@gmail.com> Steffen Röcker <sroecker@gmail.com>
Stephan Walter <stephan@walter.name> Stephan Walter <stephan@walter.name>
Stephen Nichols <snichols@users.noreply.github.com> Stephen Nichols <snichols@users.noreply.github.com>
Steve Bonds <sbonds@gmail.com>
Steve Grubb <ausearch.1@gmail.com> Steve Grubb <ausearch.1@gmail.com>
Steven Prichard <spprichard20@gmail.com> Steven Prichard <spprichard20@gmail.com>
Steven Roussey <sroussey@gmail.com> Steven Roussey <sroussey@gmail.com>
Steward Garcia <57494570+FSSRepo@users.noreply.github.com> Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
SuperUserNameMan <yoann@terminajones.com> SuperUserNameMan <yoann@terminajones.com>
Sutou Kouhei <kou@cozmixng.org>
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com> Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
Taikono-Himazin <kazu@po.harenet.ne.jp> Taikono-Himazin <kazu@po.harenet.ne.jp>
Tameem <113388789+AhmadTameem@users.noreply.github.com> Tameem <113388789+AhmadTameem@users.noreply.github.com>
@ -507,7 +635,9 @@ Theia Vogel <theia@vgel.me>
Thérence <13496987+Royalphax@users.noreply.github.com> Thérence <13496987+Royalphax@users.noreply.github.com>
Thibault Terrasson <thibault.terrasson@gmail.com> Thibault Terrasson <thibault.terrasson@gmail.com>
Thomas Klausner <wiz@gatalith.at> Thomas Klausner <wiz@gatalith.at>
Thorsten Sommer <SommerEngineering@users.noreply.github.com>
Tim Miller <drasticactions@users.noreply.github.com> Tim Miller <drasticactions@users.noreply.github.com>
Tim Wang <overocean@gmail.com>
Timmy Knight <r2d2fish@gmail.com> Timmy Knight <r2d2fish@gmail.com>
Timothy Cronin <40186632+4imothy@users.noreply.github.com> Timothy Cronin <40186632+4imothy@users.noreply.github.com>
Ting Lou <ting.lou@gmail.com> Ting Lou <ting.lou@gmail.com>
@ -517,24 +647,31 @@ Tom C <tom.corelis@gmail.com>
Tom Jobbins <784313+TheBloke@users.noreply.github.com> Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Tomas <tom.tomas.36478119@gmail.com> Tomas <tom.tomas.36478119@gmail.com>
Tomáš Pazdiora <tomas.pazdiora@gmail.com> Tomáš Pazdiora <tomas.pazdiora@gmail.com>
Tony Wasserka <4840017+neobrain@users.noreply.github.com>
Tristan Druyen <tristan@vault81.mozmail.com> Tristan Druyen <tristan@vault81.mozmail.com>
Tristan Ross <rosscomputerguy@protonmail.com> Tristan Ross <rosscomputerguy@protonmail.com>
Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
Tungsten842 <886724vf@anonaddy.me> Tungsten842 <886724vf@anonaddy.me>
Tungsten842 <quantmint@protonmail.com> Tungsten842 <quantmint@protonmail.com>
Tushar <ditsuke@protonmail.com> Tushar <ditsuke@protonmail.com>
UEXTM.com <84163508+uextm@users.noreply.github.com> UEXTM.com <84163508+uextm@users.noreply.github.com>
Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
Ulrich Drepper <drepper@gmail.com> Ulrich Drepper <drepper@gmail.com>
Uzo Nweke <uzoechi@gmail.com> Uzo Nweke <uzoechi@gmail.com>
Vaibhav Srivastav <vaibhavs10@gmail.com> Vaibhav Srivastav <vaibhavs10@gmail.com>
Val Kharitonov <mail@kharvd.com> Val Kharitonov <mail@kharvd.com>
Valentin Konovalov <valle.ketsujin@gmail.com> Valentin Konovalov <valle.ketsujin@gmail.com>
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
Vali Malinoiu <0x4139@gmail.com>
Victor Nogueira <felladrin@gmail.com> Victor Nogueira <felladrin@gmail.com>
Victor Z. Peng <ziliangdotme@gmail.com> Victor Z. Peng <ziliangdotme@gmail.com>
Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
Vlad <spitfireage@gmail.com> Vlad <spitfireage@gmail.com>
Vladimir <bogdad@gmail.com> Vladimir <bogdad@gmail.com>
Vladimir Malyutin <first-leon@yandex.ru> Vladimir Malyutin <first-leon@yandex.ru>
Vladimir Zorin <vladimir@deviant.guru> Vladimir Zorin <vladimir@deviant.guru>
VoidIsVoid <343750470@qq.com>
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
Weird Constructor <weirdconstructor@gmail.com> Weird Constructor <weirdconstructor@gmail.com>
@ -551,15 +688,22 @@ Xiang (Kevin) Li <kevinli020508@gmail.com>
Xiao-Yong Jin <jinxiaoyong@gmail.com> Xiao-Yong Jin <jinxiaoyong@gmail.com>
XiaotaoChen <chenxiaotao1234@gmail.com> XiaotaoChen <chenxiaotao1234@gmail.com>
Xiaoyi Chen <cxychina@gmail.com> Xiaoyi Chen <cxychina@gmail.com>
Xie Yanbo <xieyanbo@gmail.com>
Xingchen Song(宋星辰) <xingchensong1996@163.com> Xingchen Song(宋星辰) <xingchensong1996@163.com>
Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
Xuan Son Nguyen <thichthat@gmail.com> Xuan Son Nguyen <thichthat@gmail.com>
Yaiko <elyaiko@hotmail.com>
Yann Follet <131855179+YannFollet@users.noreply.github.com> Yann Follet <131855179+YannFollet@users.noreply.github.com>
Yaroslav <yaroslav.yashin@me.com> Yaroslav <yaroslav.yashin@me.com>
Yazan Agha-Schrader <mountaiin@icloud.com> Yazan Agha-Schrader <mountaiin@icloud.com>
Yiming Cui <conandiy@vip.qq.com> Yiming Cui <conandiy@vip.qq.com>
Yishuo Wang <MeouSker77@outlook.com> Yishuo Wang <MeouSker77@outlook.com>
Yoshi Suhara <y.suhara@gmail.com>
Yoshi Suhara <ysuhara@nvidia.com>
Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
Yui <dev@sleepyyui.com> Yui <dev@sleepyyui.com>
Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com> Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
ZHAOKAI WANG <sanxianwei@163.com> ZHAOKAI WANG <sanxianwei@163.com>
@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com>
Zenix <zenixls2@gmail.com> Zenix <zenixls2@gmail.com>
Zhang Peiyuan <a1286225768@gmail.com> Zhang Peiyuan <a1286225768@gmail.com>
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
Zhiyuan Li <lizhiyuan@uniartisan.com>
ZhouYuChen <zhouyuchen@naver.com> ZhouYuChen <zhouyuchen@naver.com>
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com> Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com>
alonfaraj <alonfaraj@gmail.com> alonfaraj <alonfaraj@gmail.com>
alwqx <kenan3015@gmail.com> alwqx <kenan3015@gmail.com>
amd-lalithnc <lalithnc@amd.com> amd-lalithnc <lalithnc@amd.com>
amritahs-ibm <amritahs@linux.vnet.ibm.com>
andrijdavid <david@geek.mg> andrijdavid <david@geek.mg>
anon998 <131767832+anon998@users.noreply.github.com> anon998 <131767832+anon998@users.noreply.github.com>
anzz1 <anzz1@live.com> anzz1 <anzz1@live.com>
@ -588,14 +735,18 @@ apaz <aarpazdera@gmail.com>
apcameron <37645737+apcameron@users.noreply.github.com> apcameron <37645737+apcameron@users.noreply.github.com>
arch-btw <57669023+arch-btw@users.noreply.github.com> arch-btw <57669023+arch-btw@users.noreply.github.com>
arcrank <arcrank@gmail.com> arcrank <arcrank@gmail.com>
ardfork <134447697+ardfork@users.noreply.github.com>
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
at8u <129688334+at8u@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com>
automaticcat <daogiatuank54@gmail.com> automaticcat <daogiatuank54@gmail.com>
awatuna <23447591+awatuna@users.noreply.github.com>
b4b4o <zwbao@foxmail.com>
bandoti <141645996+bandoti@users.noreply.github.com> bandoti <141645996+bandoti@users.noreply.github.com>
beiller <beiller@gmail.com> beiller <beiller@gmail.com>
bhubbb <79117352+bhubbb@users.noreply.github.com> bhubbb <79117352+bhubbb@users.noreply.github.com>
bmwl <brian.marshall@tolko.com> bmwl <brian.marshall@tolko.com>
bobqianic <129547291+bobqianic@users.noreply.github.com> bobqianic <129547291+bobqianic@users.noreply.github.com>
brucepro <git@brucepro.net>
bryanSwk <93190252+bryanSwk@users.noreply.github.com> bryanSwk <93190252+bryanSwk@users.noreply.github.com>
bsilvereagle <bsilvereagle@users.noreply.github.com> bsilvereagle <bsilvereagle@users.noreply.github.com>
bssrdf <merlintiger@hotmail.com> bssrdf <merlintiger@hotmail.com>
@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com>
crasm <crasm@git.vczf.net> crasm <crasm@git.vczf.net>
crasm <crasm@git.vczf.us> crasm <crasm@git.vczf.us>
daboe01 <daboe01@googlemail.com> daboe01 <daboe01@googlemail.com>
daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
daminho <37615795+daminho@users.noreply.github.com>
david raistrick <keen99@users.noreply.github.com> david raistrick <keen99@users.noreply.github.com>
ddh0 <dylanhalladay02@icloud.com> ddh0 <dylanhalladay02@icloud.com>
ddpasa <112642920+ddpasa@users.noreply.github.com> ddpasa <112642920+ddpasa@users.noreply.github.com>
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
devojony <61173062+devojony@users.noreply.github.com>
ditsuke <ditsuke@protonmail.com>
divinity76 <divinity76@gmail.com> divinity76 <divinity76@gmail.com>
dm4 <sunrisedm4@gmail.com> dm4 <sunrisedm4@gmail.com>
dotpy314 <33351922+dotpy314@users.noreply.github.com> dotpy314 <33351922+dotpy314@users.noreply.github.com>
@ -629,14 +784,18 @@ ebraminio <ebraminio@gmail.com>
eiery <19350831+eiery@users.noreply.github.com> eiery <19350831+eiery@users.noreply.github.com>
eric8607242 <e0928021388@gmail.com> eric8607242 <e0928021388@gmail.com>
fairydreaming <166155368+fairydreaming@users.noreply.github.com> fairydreaming <166155368+fairydreaming@users.noreply.github.com>
fengerhu1 <2748250768@qq.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com>
github-actions[bot] <github-actions[bot]@users.noreply.github.com> github-actions[bot] <github-actions[bot]@users.noreply.github.com>
gliptic <gliptic@users.noreply.github.com> gliptic <gliptic@users.noreply.github.com>
goerch <jhr.walter@t-online.de> goerch <jhr.walter@t-online.de>
grahameth <96447521+grahameth@users.noreply.github.com> grahameth <96447521+grahameth@users.noreply.github.com>
gtygo <gtydoit@gmail.com>
gwjr <502526+gwjr@users.noreply.github.com> gwjr <502526+gwjr@users.noreply.github.com>
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com> h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
hankcs <cnhankmc@gmail.com> hankcs <cnhankmc@gmail.com>
haopeng <657407891@qq.com>
hipudding <huafengchun@gmail.com>
hoangmit <hoangmit@users.noreply.github.com> hoangmit <hoangmit@users.noreply.github.com>
hongbo.mo <352280764@qq.com> hongbo.mo <352280764@qq.com>
hopkins385 <98618192+hopkins385@users.noreply.github.com> hopkins385 <98618192+hopkins385@users.noreply.github.com>
@ -649,12 +808,14 @@ hxer7963 <hxer7963@gmail.com>
hydai <z54981220@gmail.com> hydai <z54981220@gmail.com>
iSma <ismail.senhaji@gmail.com> iSma <ismail.senhaji@gmail.com>
iacore <74560659+iacore@users.noreply.github.com> iacore <74560659+iacore@users.noreply.github.com>
icppWorld <124377669+icppWorld@users.noreply.github.com>
igarnier <igarnier@protonmail.com> igarnier <igarnier@protonmail.com>
intelmatt <61025942+intelmatt@users.noreply.github.com> intelmatt <61025942+intelmatt@users.noreply.github.com>
iohub <rickyang.pro@gmail.com> iohub <rickyang.pro@gmail.com>
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
jameswu2014 <545426914@qq.com> jameswu2014 <545426914@qq.com>
jdomke <28772296+jdomke@users.noreply.github.com>
jiez <373447296@qq.com> jiez <373447296@qq.com>
jneem <joeneeman@gmail.com> jneem <joeneeman@gmail.com>
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com>
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
kunnis <kunnis@users.noreply.github.com> kunnis <kunnis@users.noreply.github.com>
kuronekosaiko <EvanChanJ@163.com> kuronekosaiko <EvanChanJ@163.com>
kustaaya <58045274+kustaaya@users.noreply.github.com>
kuvaus <22169537+kuvaus@users.noreply.github.com> kuvaus <22169537+kuvaus@users.noreply.github.com>
kwin1412 <42286931+kwin1412@users.noreply.github.com> kwin1412 <42286931+kwin1412@users.noreply.github.com>
l3utterfly <gc.pthzfoldr@gmail.com> l3utterfly <gc.pthzfoldr@gmail.com>
laik <laik.lj@me.com>
ldwang <ftgreat@163.com> ldwang <ftgreat@163.com>
le.chang <cljs118@126.com> le.chang <cljs118@126.com>
leejet <leejet714@gmail.com> leejet <leejet714@gmail.com>
leo-pony <nengjunma@outlook.com>
limitedAtonement <limitedAtonement@users.noreply.github.com> limitedAtonement <limitedAtonement@users.noreply.github.com>
liuwei-git <14815172+liuwei-git@users.noreply.github.com> liuwei-git <14815172+liuwei-git@users.noreply.github.com>
lon <114724657+longregen@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com>
loonerin <132926317+loonerin@users.noreply.github.com> loonerin <132926317+loonerin@users.noreply.github.com>
ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
luoyu-intel <yu.luo@intel.com> luoyu-intel <yu.luo@intel.com>
m3ndax <adrian.goessl@outlook.com> m3ndax <adrian.goessl@outlook.com>
maddes8cht <55592906+maddes8cht@users.noreply.github.com> maddes8cht <55592906+maddes8cht@users.noreply.github.com>
makomk <makosoft@googlemail.com> makomk <makosoft@googlemail.com>
manikbhandari <mbbhandarimanik2@gmail.com> manikbhandari <mbbhandarimanik2@gmail.com>
maor-ps <154728172+maor-ps@users.noreply.github.com> maor-ps <154728172+maor-ps@users.noreply.github.com>
matiaslin <45382001+matiaslin@users.noreply.github.com>
matteo <matteogeniaccio@yahoo.it>
mdrokz <mohammadmunshi@gmail.com> mdrokz <mohammadmunshi@gmail.com>
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
minarchist <minarchist@users.noreply.github.com> minarchist <minarchist@users.noreply.github.com>
mj-shifu <77107165+mj-shifu@users.noreply.github.com> mj-shifu <77107165+mj-shifu@users.noreply.github.com>
mmyjona <jonathan.gonse@gmail.com> mmyjona <jonathan.gonse@gmail.com>
momonga <115213907+mmnga@users.noreply.github.com> momonga <115213907+mmnga@users.noreply.github.com>
momonga <146910567+mmngays@users.noreply.github.com>
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
mzcu <milos.cubrilo@gmail.com> mzcu <milos.cubrilo@gmail.com>
nanahi <130121847+na-na-hi@users.noreply.github.com> nanahi <130121847+na-na-hi@users.noreply.github.com>
@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com>
oobabooga <112222186+oobabooga@users.noreply.github.com> oobabooga <112222186+oobabooga@users.noreply.github.com>
opparco <parco.opaai@gmail.com> opparco <parco.opaai@gmail.com>
ostix360 <55257054+ostix360@users.noreply.github.com> ostix360 <55257054+ostix360@users.noreply.github.com>
pculliton <phillipculliton@gmail.com>
pengxin99 <pengxin.yuan@intel.com> pengxin99 <pengxin.yuan@intel.com>
perserk <perserk@gmail.com> perserk <perserk@gmail.com>
piDack <104877312+piDack@users.noreply.github.com>
pmysl <piotr.myslinski@outlook.com> pmysl <piotr.myslinski@outlook.com>
postmasters <namnguyen@google.com> postmasters <namnguyen@google.com>
pudepiedj <pudepiedj@gmail.com> pudepiedj <pudepiedj@gmail.com>
@ -733,6 +903,7 @@ runfuture <runfuture@users.noreply.github.com>
sandyiscool <sandyiscool@gmail.com> sandyiscool <sandyiscool@gmail.com>
sasha0552 <admin@sasha0552.org> sasha0552 <admin@sasha0552.org>
semidark <me@semidark.net> semidark <me@semidark.net>
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
sharpHL <132747147+sharpHL@users.noreply.github.com> sharpHL <132747147+sharpHL@users.noreply.github.com>
shibe2 <shibe@tuta.io> shibe2 <shibe@tuta.io>
singularity <12184989+singularity-s0@users.noreply.github.com> singularity <12184989+singularity-s0@users.noreply.github.com>
@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
slaren <2141330+slaren@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com>
slaren <slarengh@gmail.com> slaren <slarengh@gmail.com>
snadampal <87143774+snadampal@users.noreply.github.com> snadampal <87143774+snadampal@users.noreply.github.com>
standby24x7 <standby24x7@gmail.com>
staviq <staviq@gmail.com> staviq <staviq@gmail.com>
stduhpf <stephduh@live.fr> stduhpf <stephduh@live.fr>
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com> strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
swittk <switt1995@gmail.com> swittk <switt1995@gmail.com>
takov751 <40316768+takov751@users.noreply.github.com> takov751 <40316768+takov751@users.noreply.github.com>
tarcey <cey.tarik@gmail.com> tarcey <cey.tarik@gmail.com>
tc-mb <157115220+tc-mb@users.noreply.github.com>
texmex76 <40733439+texmex76@users.noreply.github.com> texmex76 <40733439+texmex76@users.noreply.github.com>
thement <40525767+thement@users.noreply.github.com> thement <40525767+thement@users.noreply.github.com>
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
tjohnman <tjohnman@users.noreply.github.com> tjohnman <tjohnman@users.noreply.github.com>
toyer <2042519524@qq.com>
tslmy <tslmy@users.noreply.github.com> tslmy <tslmy@users.noreply.github.com>
ubik2 <ubik2@users.noreply.github.com> ubik2 <ubik2@users.noreply.github.com>
uint256_t <konndennsa@gmail.com> uint256_t <konndennsa@gmail.com>
uint256_t <maekawatoshiki1017@gmail.com> uint256_t <maekawatoshiki1017@gmail.com>
unbounded <haakon@likedan.net> unbounded <haakon@likedan.net>
uvos <devnull@uvos.xyz>
valiray <133289098+valiray@users.noreply.github.com> valiray <133289098+valiray@users.noreply.github.com>
vb <vaibhavs10@gmail.com>
vik <vikhyatk@gmail.com> vik <vikhyatk@gmail.com>
viric <viric@viric.name> viric <viric@viric.name>
vodkaslime <646329483@qq.com> vodkaslime <646329483@qq.com>
vvhg1 <94630311+vvhg1@users.noreply.github.com> vvhg1 <94630311+vvhg1@users.noreply.github.com>
vxiiduu <73044267+vxiiduu@users.noreply.github.com> vxiiduu <73044267+vxiiduu@users.noreply.github.com>
wangshuai09 <391746016@qq.com>
wbpxre150 <100937007+wbpxre150@users.noreply.github.com> wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
whoreson <139810751+whoreson@users.noreply.github.com> whoreson <139810751+whoreson@users.noreply.github.com>
woachk <24752637+woachk@users.noreply.github.com> woachk <24752637+woachk@users.noreply.github.com>
wonjun Jang <strutive07@gmail.com> wonjun Jang <strutive07@gmail.com>
woodx <124784234+woodx9@users.noreply.github.com> woodx <124784234+woodx9@users.noreply.github.com>
wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
wzy <32936898+Freed-Wu@users.noreply.github.com> wzy <32936898+Freed-Wu@users.noreply.github.com>
xaedes <xaedes@gmail.com> xaedes <xaedes@gmail.com>
xaedes <xaedes@googlemail.com> xaedes <xaedes@googlemail.com>
xctan <axunlei@gmail.com>
xloem <0xloem@gmail.com> xloem <0xloem@gmail.com>
yangli2 <yangli2@gmail.com> yangli2 <yangli2@gmail.com>
yuiseki <yuiseki@gmail.com> yuiseki <yuiseki@gmail.com>
yuri@FreeBSD <yurivict@users.noreply.github.com>
zakkor <edward.partenie@gmail.com> zakkor <edward.partenie@gmail.com>
zhangkaihuo <zhangkaihuo@gmail.com> zhangkaihuo <zhangkaihuo@gmail.com>
zhentaoyu <zhentao.yu@intel.com>
zhouwg <6889919+zhouwg@users.noreply.github.com> zhouwg <6889919+zhouwg@users.noreply.github.com>
zhouwg <zhouwg2000@gmail.com> zhouwg <zhouwg2000@gmail.com>
zrm <trustiosity.zrm@gmail.com> zrm <trustiosity.zrm@gmail.com>
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com> Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
杨朱 · Kiki <baofa.fan@daocloud.io>
源文雨 <41315874+fumiama@users.noreply.github.com> 源文雨 <41315874+fumiama@users.noreply.github.com>
蕭澧邦 <45505768+shou692199@users.noreply.github.com>
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>

View File

@ -82,6 +82,7 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
# Required for relocatable CMake package # Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
# override ggml options # override ggml options
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})

View File

@ -752,7 +752,7 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
endif # GGML_VULKAN endif # GGML_VULKAN
ifdef GGML_HIPBLAS ifdef GGML_HIP
ifeq ($(wildcard /opt/rocm),) ifeq ($(wildcard /opt/rocm),)
ROCM_PATH ?= /usr ROCM_PATH ?= /usr
AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@ -807,7 +807,7 @@ ggml/src/ggml-cuda/%.o: \
ggml/src/ggml-common.h \ ggml/src/ggml-common.h \
ggml/src/ggml-cuda/common.cuh ggml/src/ggml-cuda/common.cuh
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # GGML_HIPBLAS endif # GGML_HIP
ifdef GGML_MUSA ifdef GGML_MUSA
ifeq ($(wildcard /opt/musa),) ifeq ($(wildcard /opt/musa),)
@ -815,7 +815,7 @@ ifdef GGML_MUSA
else else
MUSA_PATH ?= /opt/musa MUSA_PATH ?= /opt/musa
endif endif
MTGPU_TARGETS ?= mp_21 mp_22 MUSA_ARCHITECTURES ?= 21;22
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
@ -834,7 +834,8 @@ ifdef GGML_MUSA
CXX := $(MUSA_PATH)/bin/clang++ CXX := $(MUSA_PATH)/bin/clang++
MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS)) MUSAFLAGS = -x musa -mtgpu
MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
ifdef GGML_CUDA_FORCE_MMQ ifdef GGML_CUDA_FORCE_MMQ
MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
@ -878,14 +879,14 @@ ggml/src/ggml-cuda/ggml-cuda.o: \
ggml/src/ggml-backend-impl.h \ ggml/src/ggml-backend-impl.h \
ggml/src/ggml-common.h \ ggml/src/ggml-common.h \
$(wildcard ggml/src/ggml-cuda/*.cuh) $(wildcard ggml/src/ggml-cuda/*.cuh)
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $< $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-cuda/%.o: \
ggml/src/ggml-cuda/%.cu \ ggml/src/ggml-cuda/%.cu \
ggml/include/ggml.h \ ggml/include/ggml.h \
ggml/src/ggml-common.h \ ggml/src/ggml-common.h \
ggml/src/ggml-cuda/common.cuh ggml/src/ggml-cuda/common.cuh
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $< $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
endif # GGML_MUSA endif # GGML_MUSA
ifdef GGML_METAL ifdef GGML_METAL

View File

@ -79,6 +79,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo) - [x] [OLMo](https://allenai.org/olmo)
- [x] [OLMo 2](https://allenai.org/olmo)
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924) - [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

33
cmake/common.cmake Normal file
View File

@ -0,0 +1,33 @@
function(llama_add_compile_flags)
if (LLAMA_FATAL_WARNINGS)
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
list(APPEND C_FLAGS -Werror)
list(APPEND CXX_FLAGS -Werror)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
add_compile_options(/WX)
endif()
endif()
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-Werror=implicit-int -Werror=implicit-function-declaration)
list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
list(APPEND C_FLAGS ${WARNING_FLAGS})
list(APPEND CXX_FLAGS ${WARNING_FLAGS})
ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
else()
# todo : msvc
set(C_FLAGS "" PARENT_SCOPE)
set(CXX_FLAGS "" PARENT_SCOPE)
endif()
endif()
endfunction()

View File

@ -2,6 +2,8 @@
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
llama_add_compile_flags()
# Build info header # Build info header
# #

View File

@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
} }
params.hf_file = params.model; params.hf_file = params.model;
} else if (params.model.empty()) { } else if (params.model.empty()) {
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back()); // this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = params.hf_repo + "_" + params.hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
params.model = fs_get_cache_file(filename);
} }
} else if (!params.model_url.empty()) { } else if (!params.model_url.empty()) {
if (params.model.empty()) { if (params.model.empty()) {
@ -1366,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_gpu_layers = value; params.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) { if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
} }
} }
).set_env("LLAMA_ARG_N_GPU_LAYERS")); ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@ -2100,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_gpu_layers = value; params.speculative.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) { if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
} }
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));

View File

@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = nullptr; llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) { if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
} else if (!params.model_url.empty()) { } else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else { } else {
model = llama_load_model_from_file(params.model.c_str(), mparams); model = llama_load_model_from_file(params.model.c_str(), mparams);
} }
@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
} }
struct llama_model * common_load_model_from_url( struct llama_model * common_load_model_from_url(
const char * model_url, const std::string & model_url,
const char * path_model, const std::string & local_path,
const char * hf_token, const std::string & hf_token,
const struct llama_model_params & params) { const struct llama_model_params & params) {
// Basic validation of the model_url // Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) { if (model_url.empty()) {
LOG_ERR("%s: invalid model_url\n", __func__); LOG_ERR("%s: invalid model_url\n", __func__);
return NULL; return NULL;
} }
if (!common_download_file(model_url, path_model, hf_token)) { if (!common_download_file(model_url, local_path, hf_token)) {
return NULL; return NULL;
} }
@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
/*.no_alloc = */ true, /*.no_alloc = */ true,
/*.ctx = */ NULL, /*.ctx = */ NULL,
}; };
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) { if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model); LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
return NULL; return NULL;
} }
@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
// Verify the first split file format // Verify the first split file format
// and extract split URL and PATH prefixes // and extract split URL and PATH prefixes
{ {
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
return NULL; return NULL;
} }
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
return NULL; return NULL;
} }
} }
@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
} }
} }
return llama_load_model_from_file(path_model, params); return llama_load_model_from_file(local_path.c_str(), params);
} }
struct llama_model * common_load_model_from_hf( struct llama_model * common_load_model_from_hf(
const char * repo, const std::string & repo,
const char * model, const std::string & remote_path,
const char * path_model, const std::string & local_path,
const char * hf_token, const std::string & hf_token,
const struct llama_model_params & params) { const struct llama_model_params & params) {
// construct hugging face model url: // construct hugging face model url:
// //
@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
std::string model_url = "https://huggingface.co/"; std::string model_url = "https://huggingface.co/";
model_url += repo; model_url += repo;
model_url += "/resolve/main/"; model_url += "/resolve/main/";
model_url += model; model_url += remote_path;
return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params); return common_load_model_from_url(model_url, local_path, hf_token, params);
} }
#else #else
struct llama_model * common_load_model_from_url( struct llama_model * common_load_model_from_url(
const char * /*model_url*/, const std::string & /*model_url*/,
const char * /*path_model*/, const std::string & /*local_path*/,
const char * /*hf_token*/, const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) { const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr; return nullptr;
} }
struct llama_model * common_load_model_from_hf( struct llama_model * common_load_model_from_hf(
const char * /*repo*/, const std::string & /*repo*/,
const char * /*model*/, const std::string & /*remote_path*/,
const char * /*path_model*/, const std::string & /*local_path*/,
const char * /*hf_token*/, const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) { const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr; return nullptr;

View File

@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * common_load_model_from_url(
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
// clear LoRA adapters from context, then apply new list of adapters // clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters); void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

View File

@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
``` ```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" $ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
``` ```
Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

View File

@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
## News ## News
- 2024.11
- Support F16 and F32 data type model for Ascend 310P NPU.
- 2024.8 - 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU. - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7 - 2024.7
@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
### Ascend NPU ### Ascend NPU
**Verified devices** **Verified devices**
| Ascend NPU | Status | | Ascend NPU | Status |
|:-----------------------------:|:-------:| |:-----------------------------:|:-------:|
| Atlas 300T A2 | Support | | Atlas 300T A2 | Support |
| Atlas 300I Duo | Support |
*Notes:* *Notes:*

View File

@ -221,7 +221,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
- Using `make`: - Using `make`:
```bash ```bash
make GGML_HIPBLAS=1 make GGML_HIP=1
``` ```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash ```bash
@ -249,7 +249,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
- Using `make` (example for target gfx1030, build with 16 CPU threads): - Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash ```bash
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
``` ```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):

View File

@ -6,6 +6,10 @@ find_package(Threads REQUIRED)
# ... # ...
# flags
llama_add_compile_flags()
# examples # examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})

View File

@ -4,10 +4,17 @@ install(TARGETS ${TARGET} RUNTIME)
# clibs dependencies # clibs dependencies
include_directories(deps/) include_directories(deps/)
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h) add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
target_link_libraries(${TARGET} PRIVATE xxhash) target_link_libraries(${TARGET} PRIVATE xxhash)
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h) add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
target_link_libraries(${TARGET} PRIVATE sha1) target_link_libraries(${TARGET} PRIVATE sha1)
if (NOT MSVC)
# disable warnings in 3rd party code
target_compile_options(sha1 PRIVATE -w)
endif()
add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h) add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256) target_link_libraries(${TARGET} PRIVATE sha256)

View File

@ -40,10 +40,17 @@
#include <cinttypes> #include <cinttypes>
#include <limits> #include <limits>
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) #if defined(LLAVA_LOG_OFF)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) # define LOG_INF(...)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) # define LOG_WRN(...)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) # define LOG_ERR(...)
# define LOG_DBG(...)
#else // defined(LLAVA_LOG_OFF)
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS

View File

@ -11,13 +11,17 @@
#include <limits> #include <limits>
#include <vector> #include <vector>
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #if defined(LLAVA_LOG_OFF)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) # define LOG_INF(...)
# define LOG_WRN(...)
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) # define LOG_ERR(...)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) # define LOG_DBG(...)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) #else // defined(LLAVA_LOG_OFF)
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) # define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)
// RGB uint8 image // RGB uint8 image
struct clip_image_u8 { struct clip_image_u8 {
@ -515,10 +519,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
errno = 0; errno = 0;
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
if (ferror(file)) { if (ferror(file)) {
die_fmt("read error: %s", strerror(errno)); LOG_ERR("read error: %s", strerror(errno));
free(buffer);
fclose(file);
return false;
} }
if (ret != (size_t) fileSize) { if (ret != (size_t) fileSize) {
die("unexpectedly reached end of file"); LOG_ERR("unexpectedly reached end of file");
free(buffer);
fclose(file);
return false;
} }
fclose(file); // Close the file fclose(file); // Close the file

View File

@ -2267,12 +2267,7 @@ struct server_context {
continue; // continue loop of slots continue; // continue loop of slots
} }
llama_token id; llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
{
completion_token_output result;
id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
slot.i_batch = -1; slot.i_batch = -1;
@ -2285,6 +2280,7 @@ struct server_context {
metrics.on_prompt_eval(slot); metrics.on_prompt_eval(slot);
} }
completion_token_output result;
result.tok = id; result.tok = id;
const auto * cur_p = common_sampler_get_candidates(slot.smpl); const auto * cur_p = common_sampler_get_candidates(slot.smpl);
@ -2306,11 +2302,14 @@ struct server_context {
} }
} }
// check if the slot supports speculative decoding // do speculative decoding
if (!slot.can_speculate()) { for (auto & slot : slots) {
if (!slot.is_processing() || !slot.can_speculate()) {
continue; continue;
} }
llama_token id = slot.sampled;
struct common_speculative_params params_spec; struct common_speculative_params params_spec;
params_spec.n_draft = slot.params.speculative.n_max; params_spec.n_draft = slot.params.speculative.n_max;
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;

View File

@ -1 +1,2 @@
.venv .venv
tmp

View File

@ -1,19 +1,9 @@
# Server tests # Server tests
Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) Python based server tests scenario using [pytest](https://docs.pytest.org/en/stable/).
and [behave](https://behave.readthedocs.io/en/latest/):
* [issues.feature](./features/issues.feature) Pending issues scenario
* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
* [security.feature](./features/security.feature) Security, CORS and API Key
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
Tests target GitHub workflows job runners with 4 vCPU. Tests target GitHub workflows job runners with 4 vCPU.
Requests are
using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html)
based http client.
Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
To mitigate it, you can increase values in `n_predict`, `kv_size`. To mitigate it, you can increase values in `n_predict`, `kv_size`.
@ -39,26 +29,19 @@ It's possible to override some scenario steps values with environment variables:
|--------------------------|------------------------------------------------------------------------------------------------| |--------------------------|------------------------------------------------------------------------------------------------|
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` | | `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` | | `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` | | `DEBUG` | to enable steps and server verbose mode `--verbose` |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` | | `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
### Run @bug, @wip or @wrong_usage annotated scenario To run slow tests:
Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
- `@bug` annotation aims to link a scenario with a GitHub issue.
- `@wrong_usage` are meant to show user issue that are actually an expected behavior
- `@wip` to focus on a scenario working in progress
- `@slow` heavy test, disabled by default
To run a scenario annotated with `@bug`, start:
```shell ```shell
DEBUG=ON ./tests.sh --no-skipped --tags bug --stop SLOW_TESTS=1 ./tests.sh
``` ```
After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated. To run with stdout/stderr display in real time (verbose output, but useful for debugging):
```shell ```shell
./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile" DEBUG=1 ./tests.sh -s -v -x
``` ```
To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)

View File

@ -0,0 +1,15 @@
import pytest
from utils import *
# ref: https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test
@pytest.fixture(autouse=True)
def stop_server_after_each_test():
# do nothing before each test
yield
# stop all servers after each test
instances = set(
server_instances
) # copy the set to prevent 'Set changed size during iteration'
for server in instances:
server.stop()

View File

@ -1,66 +0,0 @@
@llama.cpp
@ctx_shift
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
And BOS token is 1
And 42 as server seed
And 256 KV cache size
And 32 as batch size
And 2 slots
# the prompt is 301 tokens
# the slot context is 256/2 = 128 tokens
# the prompt is truncated to keep the last 109 tokens
# 64 tokens are generated thanks to shifting the context when it gets full
Scenario: Inference with context shift
And 64 server max tokens to predict
Then the server is starting
Then the server is healthy
Given a prompt:
"""
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
And a completion request with no api error
Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
And the completion is truncated
And 109 prompt tokens are processed
Scenario Outline: Inference without context shift
And <n_predict> server max tokens to predict
And disable context shifting
Then the server is starting
Then the server is healthy
Given a prompt:
"""
Hi how are you
"""
And a completion request with no api error
Then <n_token_output> tokens are predicted matching twind|Anna
And the completion is <truncated> truncated
And 8 prompt tokens are processed
Examples:
| n_predict | n_token_output | truncated |
| 64 | 64 | not |
| -1 | 120 | |
Scenario: Inference without context shift (expected error: prompt too long)
And disable context shifting
Then the server is starting
Then the server is healthy
Given a prompt:
"""
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
And a completion request with 400 api error

View File

@ -1,113 +0,0 @@
@llama.cpp
@embeddings
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
And a model file bert-bge-small.gguf
And a model alias bert-bge-small
And 42 as server seed
And 2 slots
# the bert-bge-small model has context size of 512
# since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
And 128 as batch size
And 128 as ubatch size
And 512 KV cache size
And enable embeddings endpoint
Then the server is starting
Then the server is healthy
Scenario: Embedding
When embeddings are computed for:
"""
What is the capital of Bulgaria ?
"""
Then embeddings are generated
Scenario: Embedding (error: prompt too long)
When embeddings are computed for:
"""
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
And embeddings request with 500 api error
Scenario: OAI Embeddings compatibility
Given a model bert-bge-small
When an OAI compatible embeddings computation request for:
"""
What is the capital of Spain ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility with multiple inputs
Given a model bert-bge-small
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
When an OAI compatible embeddings computation request for multiple inputs
Then embeddings are generated
Scenario: Multi users embeddings
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And a prompt:
"""
Write a very long poem.
"""
And a prompt:
"""
Write a very long joke.
"""
Given concurrent embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated
Scenario: Multi users OAI compatibility embeddings
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
And a prompt:
"""
What is the biggest US city ?
"""
And a prompt:
"""
What is the capital of Bulgaria ?
"""
And a model bert-bge-small
Given concurrent OAI embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated
Scenario: All embeddings should be the same
Given 10 fixed prompts
And a model bert-bge-small
Given concurrent OAI embedding requests
Then all embeddings are the same

View File

@ -1,71 +0,0 @@
import os
import signal
import socket
import sys
import time
import traceback
from contextlib import closing
from subprocess import TimeoutExpired
def before_scenario(context, scenario):
context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
if context.debug:
print("DEBUG=ON")
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
port = 8080
if 'PORT' in os.environ:
port = int(os.environ['PORT'])
if is_server_listening("localhost", port):
assert False, "Server already started"
def after_scenario(context, scenario):
try:
if 'server_process' not in context or context.server_process is None:
return
if scenario.status == "failed":
if 'GITHUB_ACTIONS' in os.environ:
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n")
if os.path.isfile('llama.log'):
with closing(open('llama.log', 'r')) as f:
for line in f:
print(line)
if not is_server_listening(context.server_fqdn, context.server_port):
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
if context.server_process.poll() is not None:
assert False, f"Server not running pid={context.server_process.pid} ..."
server_graceful_shutdown(context) # SIGINT
try:
context.server_process.wait(0.5)
except TimeoutExpired:
print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
context.server_process.kill() # SIGKILL
context.server_process.wait()
while is_server_listening(context.server_fqdn, context.server_port):
time.sleep(0.1)
except Exception:
print("ignoring error in after_scenario:")
traceback.print_exc(file=sys.stdout)
def server_graceful_shutdown(context):
print(f"shutting down server pid={context.server_process.pid} ...")
if os.name == 'nt':
interrupt = signal.CTRL_C_EVENT
else:
interrupt = signal.SIGINT
context.server_process.send_signal(interrupt)
def is_server_listening(server_fqdn, server_port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
result = sock.connect_ex((server_fqdn, server_port))
_is_server_listening = result == 0
if _is_server_listening:
print(f"server is listening on {server_fqdn}:{server_port}...")
return _is_server_listening

View File

@ -1,36 +0,0 @@
@llama.cpp
@infill
Feature: llama.cpp server
# The current model is made by adding FIM tokens to the existing stories260K
# We may want to use a better model in the future, maybe something like SmolLM 360M
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models
And a model file test-model-infill.gguf
And a model alias tinyllama-infill
And 42 as server seed
And 1024 as batch size
And 1024 as ubatch size
And 2048 KV cache size
And 64 max tokens to predict
And 0.0 temperature
Then the server is starting
Then the server is healthy
Scenario: Infill without input_extra
Given a prompt "Complete this"
And an infill input extra none none
And an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_"
And an infill input suffix "}\n"
And an infill request with no api error
Then 64 tokens are predicted matching One|day|she|saw|big|scary|bird
Scenario: Infill with input_extra
Given a prompt "Complete this"
And an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n"
And an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_"
And an infill input suffix "}\n"
And an infill request with no api error
Then 64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room"

View File

@ -1,5 +0,0 @@
# List of ongoing issues
# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
@bug
Feature: Issues
# No confirmed issue at the moment

View File

@ -1,36 +0,0 @@
@llama.cpp
@lora
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf
And a model file stories15M_MOE-F16.gguf
And a model alias stories15M_MOE
And a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf
And 42 as server seed
And 1024 as batch size
And 1024 as ubatch size
And 2048 KV cache size
And 64 max tokens to predict
And 0.0 temperature
Then the server is starting
Then the server is healthy
Scenario: Completion LoRA disabled
Given switch off lora adapter 0
Given a prompt:
"""
Look in thy glass
"""
And a completion request with no api error
Then 64 tokens are predicted matching little|girl|three|years|old
Scenario: Completion LoRA enabled
Given switch on lora adapter 0
Given a prompt:
"""
Look in thy glass
"""
And a completion request with no api error
Then 64 tokens are predicted matching eye|love|glass|sun

View File

@ -1,131 +0,0 @@
@llama.cpp
@parallel
Feature: Parallel
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
And a model file test-model-00001-of-00003.gguf
And 42 as server seed
And 128 as batch size
And 256 KV cache size
And 2 slots
And continuous batching
Then the server is starting
Then the server is healthy
Scenario Outline: Multi users completion
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And <n_predict> max tokens to predict
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all prompts are predicted with <n_predict> tokens
Examples:
| n_predict |
| 128 |
Scenario Outline: Multi users OAI completions compatibility
Given a system prompt You are a writer.
And a model tinyllama-2
Given a prompt:
"""
Write a very long book.
"""
And a prompt:
"""
Write another a poem.
"""
And <n_predict> max tokens to predict
And streaming is <streaming>
Given concurrent OAI completions requests
Then the server is busy
Then the server is idle
Then all prompts are predicted with <n_predict> tokens
Examples:
| streaming | n_predict |
| disabled | 128 |
| enabled | 64 |
Scenario Outline: Multi users OAI completions compatibility no v1
Given a system prompt You are a writer.
And a model tinyllama-2
Given a prompt:
"""
Write a very long book.
"""
And a prompt:
"""
Write another a poem.
"""
And <n_predict> max tokens to predict
And streaming is <streaming>
Given concurrent OAI completions requests no v1
Then the server is busy
Then the server is idle
Then all prompts are predicted with <n_predict> tokens
Examples:
| streaming | n_predict |
| disabled | 128 |
| enabled | 64 |
Scenario Outline: Multi users with number of prompts exceeding number of slots
Given a system prompt You are a writer.
And a model tinyllama-2
Given a prompt:
"""
Write a very long book.
"""
And a prompt:
"""
Write another a poem.
"""
And a prompt:
"""
What is LLM?
"""
And a prompt:
"""
The sky is blue and I love it.
"""
And <n_predict> max tokens to predict
And streaming is <streaming>
Given concurrent OAI completions requests
Then the server is busy
Then the server is idle
Then all prompts are predicted with <n_predict> tokens
Examples:
| streaming | n_predict |
| disabled | 128 |
| enabled | 64 |
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And a prompt:
"""
Write a very long poem.
"""
And a prompt:
"""
Write a very long joke.
"""
And 128 max tokens to predict
Given concurrent completion requests
Then the server is busy
Then the server is idle
Then all prompts are predicted

View File

@ -1,56 +0,0 @@
# run with: ./tests.sh --no-skipped --tags passkey
@passkey
@slow
Feature: Passkey / Self-extend with context shift
Background: Server startup
Given a server listening on localhost:8080
# Generates a long text of junk and inserts a secret passkey number inside it.
# Then we query the LLM for the secret passkey.
# see #3856 and #4810
Scenario Outline: Passkey
Given a model file <hf_file> from HF repo <hf_repo>
And <n_batch> as batch size
And <n_junk> as number of junk
And <n_predicted> server max tokens to predict
And 42 as seed
And 0.0 temperature
And <n_ctx> KV cache size
And 1 slots
And <n_ga> group attention factor to extend context size through self-extend
And <n_ga_w> group attention width to extend context size through self-extend
# Can be override with N_GPU_LAYERS
And <ngl> GPU offloaded layers
Then the server is starting
# Higher timeout because the model may need to be downloaded from the internet
Then the server is healthy with timeout 120 seconds
Given available models
Then model 0 is trained on <n_ctx_train> tokens context
Given a prefix prompt:
"""
here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
"""
And a passkey prompt template:
"""
The pass key is <passkey> Remember it. <passkey> is the pass key.
"""
And a junk suffix prompt:
"""
The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
"""
And a suffix prompt:
"""
What is the pass key? The pass key is
"""
Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
Examples:
| hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 4 | 512 | 250 | 50 | 42 | 1 | 42 |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 2 | 512 | 250 | 50 | 42 | 1 | \b((?!42)\w)+\b |
#| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
#| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
# 987 |

View File

@ -1,42 +0,0 @@
@llama.cpp
@rerank
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model url https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf
And a model file jina-reranker-v1-tiny-en.gguf
And a model alias jina-reranker-v1-tiny-en
And 42 as server seed
And 2 slots
And 512 as batch size
And 512 as ubatch size
And 512 KV cache size
And enable reranking endpoint
Then the server is starting
Then the server is healthy
Scenario: Rerank
Given a rerank query:
"""
Machine learning is
"""
And a rerank document:
"""
A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.
"""
And a rerank document:
"""
Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.
"""
And a rerank document:
"""
Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
"""
And a rerank document:
"""
Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.
"""
When reranking request
Then reranking results are returned
Then reranking highest score is index 2 and lowest score is index 3

View File

@ -1,118 +0,0 @@
@llama.cpp
@results
Feature: Results
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
And a model file test-model-00001-of-00003.gguf
And 128 as batch size
And 1024 KV cache size
And 128 max tokens to predict
And continuous batching
Scenario Outline: consistent results with same seed
Given <n_slots> slots
And 1.0 temperature
Then the server is starting
Then the server is healthy
Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all predictions are equal
Examples:
| n_slots |
| 1 |
# FIXME: unified KV cache nondeterminism
# | 2 |
Scenario Outline: different results with different seed
Given <n_slots> slots
And 1.0 temperature
Then the server is starting
Then the server is healthy
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all predictions are different
Examples:
| n_slots |
| 1 |
| 2 |
Scenario Outline: consistent results with same seed and varying batch size
Given 4 slots
And <temp> temperature
# And 0 as draft
Then the server is starting
Then the server is healthy
Given 1 prompts "Write a very long story about AI." with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Given <n_parallel> prompts "Write a very long story about AI." with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Then all predictions are equal
Examples:
| n_parallel | temp |
| 1 | 0.0 |
| 1 | 1.0 |
# FIXME: unified KV cache nondeterminism
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
# | 2 | 0.0 |
# | 4 | 0.0 |
# | 2 | 1.0 |
# | 4 | 1.0 |
Scenario Outline: consistent token probs with same seed and prompt
Given <n_slots> slots
And <n_kv> KV cache size
And 1.0 temperature
And <n_predict> max tokens to predict
Then the server is starting
Then the server is healthy
Given 1 prompts "The meaning of life is" with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Given <n_parallel> prompts "The meaning of life is" with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Then all token probabilities are equal
Examples:
| n_slots | n_kv | n_predict | n_parallel |
| 4 | 1024 | 1 | 1 |
# FIXME: unified KV cache nondeterminism
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
# | 4 | 1024 | 1 | 4 |
# | 4 | 1024 | 100 | 1 |
# This test still fails even the above patches; the first token probabilities are already different.
# | 4 | 1024 | 100 | 4 |

View File

@ -1,68 +0,0 @@
@llama.cpp
@security
Feature: Security
Background: Server startup with an api key defined
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a server api key THIS_IS_THE_KEY
Then the server is starting
Then the server is healthy
Scenario Outline: Completion with some user api key
Given a prompt test
And a user api key <api_key>
And 4 max tokens to predict
And a completion request with <api_error> api error
Examples: Prompts
| api_key | api_error |
| THIS_IS_THE_KEY | no |
| THIS_IS_THE_KEY | no |
| hackeme | raised |
| | raised |
Scenario Outline: OAI Compatibility
Given a system prompt test
And a user prompt test
And a model test
And 2 max tokens to predict
And streaming is disabled
And a user api key <api_key>
Given an OAI compatible chat completions request with <api_error> api error
Examples: Prompts
| api_key | api_error |
| THIS_IS_THE_KEY | no |
| THIS_IS_THE_KEY | no |
| hackme | raised |
Scenario Outline: OAI Compatibility (invalid response formats)
Given a system prompt test
And a user prompt test
And a response format <response_format>
And a model test
And 2 max tokens to predict
And streaming is disabled
Given an OAI compatible chat completions request with raised api error
Examples: Prompts
| response_format |
| {"type": "sound"} |
| {"type": "json_object", "schema": 123} |
| {"type": "json_object", "schema": {"type": 123}} |
| {"type": "json_object", "schema": {"type": "hiccup"}} |
Scenario Outline: CORS Options
Given a user api key THIS_IS_THE_KEY
When an OPTIONS request is sent from <origin>
Then CORS header <cors_header> is set to <cors_header_value>
Examples: Headers
| origin | cors_header | cors_header_value |
| localhost | Access-Control-Allow-Origin | localhost |
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
| origin | Access-Control-Allow-Credentials | true |
| web.mydomain.fr | Access-Control-Allow-Methods | GET, POST |
| web.mydomain.fr | Access-Control-Allow-Headers | * |

View File

@ -1,120 +0,0 @@
@llama.cpp
@server
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
And BOS token is 1
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
# see --ctx-size and #5568
And 256 KV cache size
And 32 as batch size
And 2 slots
And 64 server max tokens to predict
And prometheus compatible metrics exposed
Then the server is starting
Then the server is healthy
Scenario: Health
Then the server is ready
And all slots are idle
Scenario Outline: Completion
Given a prompt <prompt>
And <n_predict> max tokens to predict
And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
And the completion is <truncated> truncated
And <n_prompt> prompt tokens are processed
And prometheus metrics are exposed
And metric llamacpp:tokens_predicted is <n_predicted>
Examples: Prompts
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
| I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not |
Scenario: Completion prompt truncated
Given a prompt:
"""
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
And a completion request with no api error
Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
And the completion is truncated
And 109 prompt tokens are processed
Scenario Outline: OAI Compatibility
Given a model <model>
And a system prompt <system_prompt>
And a user prompt <user_prompt>
And <max_tokens> max tokens to predict
And streaming is <enable_streaming>
Given an OAI compatible chat completions request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
And <n_prompt> prompt tokens are processed
And the completion is <truncated> truncated
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | |
Scenario Outline: OAI Compatibility w/ response format
Given a model test
And a system prompt test
And a user prompt test
And a response format <response_format>
And 10 max tokens to predict
Given an OAI compatible chat completions request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
Examples: Prompts
| response_format | n_predicted | re_content |
| {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" |
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
| {"type": "json_object"} | 10 | \{ " Jacky. |
Scenario: Tokenize / Detokenize
When tokenizing:
"""
What is the capital of France ?
"""
Then tokens can be detokenized
And tokens do not begin with BOS
Scenario: Tokenize w/ BOS
Given adding special tokens
When tokenizing:
"""
What is the capital of Germany?
"""
Then tokens begin with BOS
Given first token is removed
Then tokens can be detokenized
Scenario: Tokenize with pieces
When tokenizing with pieces:
"""
What is the capital of Germany?
"""
Then tokens are given with pieces
Scenario: Models available
Given available models
Then 1 models are supported
Then model 0 is identified by tinyllama-2
Then model 0 is trained on 128 tokens context

View File

@ -1,58 +0,0 @@
@llama.cpp
@slotsave
Feature: llama.cpp server slot management
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And prompt caching is enabled
And 2 slots
And . as slot save path
And 2048 KV cache size
And 42 as server seed
And 24 max tokens to predict
Then the server is starting
Then the server is healthy
Scenario: Save and Restore Slot
# First prompt in slot 1 should be fully processed
Given a user prompt "What is the capital of France?"
And using slot id 1
And a completion request with no api error
Then 24 tokens are predicted matching (Lily|cake)
And 22 prompt tokens are processed
When the slot 1 is saved with filename "slot1.bin"
Then the server responds with status code 200
# Since we have cache, this should only process the last tokens
Given a user prompt "What is the capital of Germany?"
And a completion request with no api error
Then 24 tokens are predicted matching (Thank|special)
And 7 prompt tokens are processed
# Loading the original cache into slot 0,
# we should only be processing 1 prompt token and get the same output
When the slot 0 is restored with filename "slot1.bin"
Then the server responds with status code 200
Given a user prompt "What is the capital of France?"
And using slot id 0
And a completion request with no api error
Then 24 tokens are predicted matching (Lily|cake)
And 1 prompt tokens are processed
# For verification that slot 1 was not corrupted during slot 0 load, same thing
Given a user prompt "What is the capital of Germany?"
And using slot id 1
And a completion request with no api error
Then 24 tokens are predicted matching (Thank|special)
And 1 prompt tokens are processed
Scenario: Erase Slot
Given a user prompt "What is the capital of France?"
And using slot id 1
And a completion request with no api error
Then 24 tokens are predicted matching (Lily|cake)
And 22 prompt tokens are processed
When the slot 1 is erased
Then the server responds with status code 200
Given a user prompt "What is the capital of France?"
And a completion request with no api error
Then 24 tokens are predicted matching (Lily|cake)
And 22 prompt tokens are processed

File diff suppressed because it is too large Load Diff

View File

@ -1,25 +0,0 @@
# run with: ./tests.sh --no-skipped --tags wrong_usage
@wrong_usage
Feature: Wrong usage of llama.cpp server
#3969 The user must always set --n-predict option
# to cap the number of tokens any completion request can generate
# or pass n_predict/max_tokens in the request.
Scenario: Infinite loop
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And 42 as server seed
And 2048 KV cache size
# Uncomment below to fix the issue
#And 64 server max tokens to predict
Then the server is starting
Then the server is healthy
Given a prompt:
"""
Go to: infinite loop
"""
# Uncomment below to fix the issue
#And 128 max tokens to predict
Given concurrent completion requests
Then the server is idle
Then all prompts are predicted

View File

@ -1,7 +1,7 @@
aiohttp~=3.9.3 aiohttp~=3.9.3
behave~=1.2.6 pytest~=8.3.3
huggingface_hub~=0.23.2 huggingface_hub~=0.23.2
numpy~=1.26.4 numpy~=1.26.4
openai~=1.30.3 openai~=1.55.3
prometheus-client~=0.20.0 prometheus-client~=0.20.0
requests~=2.32.3 requests~=2.32.3

View File

@ -4,8 +4,7 @@ set -eu
if [ $# -lt 1 ] if [ $# -lt 1 ]
then then
# Start @llama.cpp scenario pytest -v -x
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
else else
behave "$@" pytest "$@"
fi fi

View File

@ -0,0 +1,34 @@
import pytest
from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
def test_server_start_simple():
global server
server.start()
res = server.make_request("GET", "/health")
assert res.status_code == 200
def test_server_props():
global server
server.start()
res = server.make_request("GET", "/props")
assert res.status_code == 200
assert res.body["total_slots"] == server.n_slots
def test_server_models():
global server
server.start()
res = server.make_request("GET", "/models")
assert res.status_code == 200
assert len(res.body["data"]) == 1
assert res.body["data"][0]["id"] == server.model_alias

View File

@ -0,0 +1,129 @@
import pytest
from openai import OpenAI
from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
@pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
[
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
]
)
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
global server
server.start()
res = server.make_request("POST", "/chat/completions", data={
"model": model,
"max_tokens": max_tokens,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
})
assert res.status_code == 200
assert res.body["usage"]["prompt_tokens"] == n_prompt
assert res.body["usage"]["completion_tokens"] == n_predicted
choice = res.body["choices"][0]
assert "assistant" == choice["message"]["role"]
assert match_regex(re_content, choice["message"]["content"])
if truncated:
assert choice["finish_reason"] == "length"
else:
assert choice["finish_reason"] == "stop"
@pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
[
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
]
)
def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
global server
server.start()
res = server.make_stream_request("POST", "/chat/completions", data={
"model": model,
"max_tokens": max_tokens,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
"stream": True,
})
content = ""
for data in res:
choice = data["choices"][0]
if choice["finish_reason"] in ["stop", "length"]:
assert data["usage"]["prompt_tokens"] == n_prompt
assert data["usage"]["completion_tokens"] == n_predicted
assert "content" not in choice["delta"]
assert match_regex(re_content, content)
# FIXME: not sure why this is incorrect in stream mode
# if truncated:
# assert choice["finish_reason"] == "length"
# else:
# assert choice["finish_reason"] == "stop"
else:
assert choice["finish_reason"] is None
content += choice["delta"]["content"]
def test_chat_completion_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct",
messages=[
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
max_tokens=8,
seed=42,
temperature=0.8,
)
print(res)
assert res.choices[0].finish_reason == "stop"
assert res.choices[0].message.content is not None
assert match_regex("(Suddenly)+", res.choices[0].message.content)
@pytest.mark.parametrize("response_format,n_predicted,re_content", [
({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
({"type": "json_object"}, 10, "(\\{|John)+"),
({"type": "sound"}, 0, None),
# invalid response format (expected to fail)
({"type": "json_object", "schema": 123}, 0, None),
({"type": "json_object", "schema": {"type": 123}}, 0, None),
({"type": "json_object", "schema": {"type": "hiccup"}}, 0, None),
])
def test_completion_with_response_format(response_format: dict, n_predicted: int, re_content: str | None):
global server
server.start()
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": n_predicted,
"messages": [
{"role": "system", "content": "You are a coding assistant."},
{"role": "user", "content": "Write an example"},
],
"response_format": response_format,
})
if re_content is not None:
assert res.status_code == 200
choice = res.body["choices"][0]
assert match_regex(re_content, choice["message"]["content"])
else:
assert res.status_code != 200
assert "error" in res.body

View File

@ -0,0 +1,223 @@
import pytest
import time
from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
])
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
global server
server.start()
res = server.make_request("POST", "/completion", data={
"n_predict": n_predict,
"prompt": prompt,
})
assert res.status_code == 200
assert res.body["timings"]["prompt_n"] == n_prompt
assert res.body["timings"]["predicted_n"] == n_predicted
assert res.body["truncated"] == truncated
assert match_regex(re_content, res.body["content"])
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
])
def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
global server
server.start()
res = server.make_stream_request("POST", "/completion", data={
"n_predict": n_predict,
"prompt": prompt,
"stream": True,
})
content = ""
for data in res:
if data["stop"]:
assert data["timings"]["prompt_n"] == n_prompt
assert data["timings"]["predicted_n"] == n_predicted
assert data["truncated"] == truncated
assert match_regex(re_content, content)
else:
content += data["content"]
@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
global server
server.n_slots = n_slots
server.start()
last_res = None
for _ in range(4):
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"seed": 42,
"temperature": 1.0,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
})
if last_res is not None:
assert res.body["content"] == last_res.body["content"]
last_res = res
@pytest.mark.parametrize("n_slots", [1, 2])
def test_different_result_different_seed(n_slots: int):
global server
server.n_slots = n_slots
server.start()
last_res = None
for seed in range(4):
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"seed": seed,
"temperature": 1.0,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
})
if last_res is not None:
assert res.body["content"] != last_res.body["content"]
last_res = res
@pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0, 1.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
global server
server.n_batch = n_batch
server.start()
last_res = None
for _ in range(4):
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"seed": 42,
"temperature": temperature,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
})
if last_res is not None:
assert res.body["content"] == last_res.body["content"]
last_res = res
@pytest.mark.skip(reason="This test fails on linux, need to be fixed")
def test_cache_vs_nocache_prompt():
global server
server.start()
res_cache = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"seed": 42,
"temperature": 1.0,
"cache_prompt": True,
})
res_no_cache = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"seed": 42,
"temperature": 1.0,
"cache_prompt": False,
})
assert res_cache.body["content"] == res_no_cache.body["content"]
def test_completion_with_tokens_input():
global server
server.temperature = 0.0
server.start()
prompt_str = "I believe the meaning of life is"
res = server.make_request("POST", "/tokenize", data={
"content": prompt_str,
"add_special": True,
})
assert res.status_code == 200
tokens = res.body["tokens"]
# single completion
res = server.make_request("POST", "/completion", data={
"prompt": tokens,
})
assert res.status_code == 200
assert type(res.body["content"]) == str
# batch completion
res = server.make_request("POST", "/completion", data={
"prompt": [tokens, tokens],
})
assert res.status_code == 200
assert type(res.body) == list
assert len(res.body) == 2
assert res.body[0]["content"] == res.body[1]["content"]
# mixed string and tokens
res = server.make_request("POST", "/completion", data={
"prompt": [tokens, prompt_str],
})
assert res.status_code == 200
assert type(res.body) == list
assert len(res.body) == 2
assert res.body[0]["content"] == res.body[1]["content"]
# mixed string and tokens in one sequence
res = server.make_request("POST", "/completion", data={
"prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
})
assert res.status_code == 200
assert type(res.body["content"]) == str
@pytest.mark.parametrize("n_slots,n_requests", [
(1, 3),
(2, 2),
(2, 4),
(4, 2), # some slots must be idle
(4, 6),
])
def test_completion_parallel_slots(n_slots: int, n_requests: int):
global server
server.n_slots = n_slots
server.temperature = 0.0
server.start()
PROMPTS = [
("Write a very long book.", "(very|special|big)+"),
("Write another a poem.", "(small|house)+"),
("What is LLM?", "(Dad|said)+"),
("The sky is blue and I love it.", "(climb|leaf)+"),
("Write another very long music lyrics.", "(friends|step|sky)+"),
("Write a very long joke.", "(cat|Whiskers)+"),
]
def check_slots_status():
should_all_slots_busy = n_requests >= n_slots
time.sleep(0.1)
res = server.make_request("GET", "/slots")
n_busy = sum([1 for slot in res.body if slot["is_processing"]])
if should_all_slots_busy:
assert n_busy == n_slots
else:
assert n_busy <= n_slots
tasks = []
for i in range(n_requests):
prompt, re_content = PROMPTS[i % len(PROMPTS)]
tasks.append((server.make_request, ("POST", "/completion", {
"prompt": prompt,
"seed": 42,
"temperature": 1.0,
})))
tasks.append((check_slots_status, ()))
results = parallel_function_calls(tasks)
# check results
for i in range(n_requests):
prompt, re_content = PROMPTS[i % len(PROMPTS)]
res = results[i]
assert res.status_code == 200
assert type(res.body["content"]) == str
assert len(res.body["content"]) > 10
# FIXME: the result is not deterministic when using other slot than slot 0
# assert match_regex(re_content, res.body["content"])

View File

@ -0,0 +1,67 @@
import pytest
from utils import *
server = ServerPreset.tinyllama2()
LONG_TEXT = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
""".strip()
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
server.n_ctx = 256
server.n_slots = 2
def test_ctx_shift_enabled():
# the prompt is 301 tokens
# the slot context is 256/2 = 128 tokens
# the prompt is truncated to keep the last 109 tokens
# 64 tokens are generated thanks to shifting the context when it gets full
global server
server.start()
res = server.make_request("POST", "/completion", data={
"n_predict": 64,
"prompt": LONG_TEXT,
})
assert res.status_code == 200
assert res.body["timings"]["prompt_n"] == 109
assert res.body["timings"]["predicted_n"] == 64
assert res.body["truncated"] is True
@pytest.mark.parametrize("n_predict,n_token_output,truncated", [
(64, 64, False),
(-1, 120, True),
])
def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
global server
server.disable_ctx_shift = True
server.n_predict = -1
server.start()
res = server.make_request("POST", "/completion", data={
"n_predict": n_predict,
"prompt": "Hi how are you",
})
assert res.status_code == 200
assert res.body["timings"]["predicted_n"] == n_token_output
assert res.body["truncated"] == truncated
def test_ctx_shift_disabled_long_prompt():
global server
server.disable_ctx_shift = True
server.start()
res = server.make_request("POST", "/completion", data={
"n_predict": 64,
"prompt": LONG_TEXT,
})
assert res.status_code != 200
assert "error" in res.body
assert "exceeds the available context size" in res.body["error"]["message"]

View File

@ -0,0 +1,99 @@
import pytest
from openai import OpenAI
from utils import *
server = ServerPreset.bert_bge_small()
EPSILON = 1e-3
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.bert_bge_small()
def test_embedding_single():
global server
server.start()
res = server.make_request("POST", "/embeddings", data={
"input": "I believe the meaning of life is",
})
assert res.status_code == 200
assert len(res.body['data']) == 1
assert 'embedding' in res.body['data'][0]
assert len(res.body['data'][0]['embedding']) > 1
# make sure embedding vector is normalized
assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
def test_embedding_multiple():
global server
server.start()
res = server.make_request("POST", "/embeddings", data={
"input": [
"I believe the meaning of life is",
"Write a joke about AI from a very long prompt which will not be truncated",
"This is a test",
"This is another test",
],
})
assert res.status_code == 200
assert len(res.body['data']) == 4
for d in res.body['data']:
assert 'embedding' in d
assert len(d['embedding']) > 1
def test_embedding_openai_library_single():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
assert len(res.data) == 1
assert len(res.data[0].embedding) > 1
def test_embedding_openai_library_multiple():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
res = client.embeddings.create(model="text-embedding-3-small", input=[
"I believe the meaning of life is",
"Write a joke about AI from a very long prompt which will not be truncated",
"This is a test",
"This is another test",
])
assert len(res.data) == 4
for d in res.data:
assert len(d.embedding) > 1
def test_embedding_error_prompt_too_long():
global server
server.start()
res = server.make_request("POST", "/embeddings", data={
"input": "This is a test " * 512,
})
assert res.status_code != 200
assert "too large" in res.body["error"]["message"]
def test_same_prompt_give_same_result():
server.start()
res = server.make_request("POST", "/embeddings", data={
"input": [
"I believe the meaning of life is",
"I believe the meaning of life is",
"I believe the meaning of life is",
"I believe the meaning of life is",
"I believe the meaning of life is",
],
})
assert res.status_code == 200
assert len(res.body['data']) == 5
for i in range(1, len(res.body['data'])):
v0 = res.body['data'][0]['embedding']
vi = res.body['data'][i]['embedding']
for x, y in zip(v0, vi):
assert abs(x - y) < EPSILON

View File

@ -0,0 +1,35 @@
import pytest
from utils import *
server = ServerPreset.tinyllama_infill()
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama_infill()
def test_infill_without_input_extra():
global server
server.start()
res = server.make_request("POST", "/infill", data={
"prompt": "Complete this",
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
"input_suffix": "}\n",
})
assert res.status_code == 200
assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
def test_infill_with_input_extra():
global server
server.start()
res = server.make_request("POST", "/infill", data={
"prompt": "Complete this",
"input_extra": [{
"filename": "llama.h",
"text": "LLAMA_API int32_t llama_n_threads();\n"
}],
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
"input_suffix": "}\n",
})
assert res.status_code == 200
assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])

View File

@ -0,0 +1,42 @@
import pytest
import os
from utils import *
server = ServerPreset.stories15m_moe()
LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.stories15m_moe()
# download lora file if needed
file_name = LORA_FILE_URL.split('/').pop()
lora_file = f'../../../{file_name}'
if not os.path.exists(lora_file):
print(f"Downloading {LORA_FILE_URL} to {lora_file}")
with open(lora_file, 'wb') as f:
f.write(requests.get(LORA_FILE_URL).content)
print(f"Done downloading lora file")
server.lora_files = [lora_file]
@pytest.mark.parametrize("scale,re_content", [
# without applying lora, the model should behave like a bedtime story generator
(0.0, "(little|girl|three|years|old)+"),
# with lora, the model should behave like a Shakespearean text generator
(1.0, "(eye|love|glass|sun)+"),
])
def test_lora(scale: float, re_content: str):
global server
server.start()
res_lora_control = server.make_request("POST", "/lora-adapters", data=[
{"id": 0, "scale": scale}
])
assert res_lora_control.status_code == 200
res = server.make_request("POST", "/completion", data={
"prompt": "Look in thy glass",
})
assert res.status_code == 200
assert match_regex(re_content, res.body["content"])

View File

@ -0,0 +1,38 @@
import pytest
from utils import *
server = ServerPreset.jina_reranker_tiny()
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.jina_reranker_tiny()
def test_rerank():
global server
server.start()
res = server.make_request("POST", "/rerank", data={
"query": "Machine learning is",
"documents": [
"A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
"Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
"Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
"Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
]
})
assert res.status_code == 200
assert len(res.body["results"]) == 4
most_relevant = res.body["results"][0]
least_relevant = res.body["results"][0]
for doc in res.body["results"]:
if doc["relevance_score"] > most_relevant["relevance_score"]:
most_relevant = doc
if doc["relevance_score"] < least_relevant["relevance_score"]:
least_relevant = doc
assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
assert most_relevant["index"] == 2
assert least_relevant["index"] == 3

View File

@ -0,0 +1,83 @@
import pytest
from openai import OpenAI
from utils import *
server = ServerPreset.tinyllama2()
TEST_API_KEY = "sk-this-is-the-secret-key"
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
server.api_key = TEST_API_KEY
@pytest.mark.parametrize("endpoint", ["/health", "/models"])
def test_access_public_endpoint(endpoint: str):
global server
server.start()
res = server.make_request("GET", endpoint)
assert res.status_code == 200
assert "error" not in res.body
@pytest.mark.parametrize("api_key", [None, "invalid-key"])
def test_incorrect_api_key(api_key: str):
global server
server.start()
res = server.make_request("POST", "/completions", data={
"prompt": "I believe the meaning of life is",
}, headers={
"Authorization": f"Bearer {api_key}" if api_key else None,
})
assert res.status_code == 401
assert "error" in res.body
assert res.body["error"]["type"] == "authentication_error"
def test_correct_api_key():
global server
server.start()
res = server.make_request("POST", "/completions", data={
"prompt": "I believe the meaning of life is",
}, headers={
"Authorization": f"Bearer {TEST_API_KEY}",
})
assert res.status_code == 200
assert "error" not in res.body
assert "content" in res.body
def test_openai_library_correct_api_key():
global server
server.start()
client = OpenAI(api_key=TEST_API_KEY, base_url=f"http://{server.server_host}:{server.server_port}")
res = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a chatbot."},
{"role": "user", "content": "What is the meaning of life?"},
],
)
assert len(res.choices) == 1
@pytest.mark.parametrize("origin,cors_header,cors_header_value", [
("localhost", "Access-Control-Allow-Origin", "localhost"),
("web.mydomain.fr", "Access-Control-Allow-Origin", "web.mydomain.fr"),
("origin", "Access-Control-Allow-Credentials", "true"),
("web.mydomain.fr", "Access-Control-Allow-Methods", "GET, POST"),
("web.mydomain.fr", "Access-Control-Allow-Headers", "*"),
])
def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
global server
server.start()
res = server.make_request("OPTIONS", "/completions", headers={
"Origin": origin,
"Access-Control-Request-Method": "POST",
"Access-Control-Request-Headers": "Authorization",
})
assert res.status_code == 200
assert cors_header in res.headers
assert res.headers[cors_header] == cors_header_value

View File

@ -0,0 +1,98 @@
import pytest
from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
server.slot_save_path = "./tmp"
server.temperature = 0.0
def test_slot_save_restore():
global server
server.start()
# First prompt in slot 1 should be fully processed
res = server.make_request("POST", "/completion", data={
"prompt": "What is the capital of France?",
"id_slot": 1,
"cache_prompt": True,
})
assert res.status_code == 200
assert match_regex("(Whiskers|Flana)+", res.body["content"])
assert res.body["timings"]["prompt_n"] == 21 # all tokens are processed
# Save state of slot 1
res = server.make_request("POST", "/slots/1?action=save", data={
"filename": "slot1.bin",
})
assert res.status_code == 200
assert res.body["n_saved"] == 84
# Since we have cache, this should only process the last tokens
res = server.make_request("POST", "/completion", data={
"prompt": "What is the capital of Germany?",
"id_slot": 1,
"cache_prompt": True,
})
assert res.status_code == 200
assert match_regex("(Jack|said)+", res.body["content"])
assert res.body["timings"]["prompt_n"] == 6 # only different part is processed
# Loading the saved cache into slot 0
res = server.make_request("POST", "/slots/0?action=restore", data={
"filename": "slot1.bin",
})
assert res.status_code == 200
assert res.body["n_restored"] == 84
# Since we have cache, slot 0 should only process the last tokens
res = server.make_request("POST", "/completion", data={
"prompt": "What is the capital of Germany?",
"id_slot": 0,
"cache_prompt": True,
})
assert res.status_code == 200
assert match_regex("(Jack|said)+", res.body["content"])
assert res.body["timings"]["prompt_n"] == 6 # only different part is processed
# For verification that slot 1 was not corrupted during slot 0 load, same thing should work
res = server.make_request("POST", "/completion", data={
"prompt": "What is the capital of Germany?",
"id_slot": 1,
"cache_prompt": True,
})
assert res.status_code == 200
assert match_regex("(Jack|said)+", res.body["content"])
assert res.body["timings"]["prompt_n"] == 1
def test_slot_erase():
global server
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "What is the capital of France?",
"id_slot": 1,
"cache_prompt": True,
})
assert res.status_code == 200
assert match_regex("(Whiskers|Flana)+", res.body["content"])
assert res.body["timings"]["prompt_n"] == 21 # all tokens are processed
# erase slot 1
res = server.make_request("POST", "/slots/1?action=erase")
assert res.status_code == 200
# re-run the same prompt, it should process all tokens again
res = server.make_request("POST", "/completion", data={
"prompt": "What is the capital of France?",
"id_slot": 1,
"cache_prompt": True,
})
assert res.status_code == 200
assert match_regex("(Whiskers|Flana)+", res.body["content"])
assert res.body["timings"]["prompt_n"] == 21 # all tokens are processed

View File

@ -0,0 +1,59 @@
import pytest
from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
def test_tokenize_detokenize():
global server
server.start()
# tokenize
content = "What is the capital of France ?"
res_tok = server.make_request("POST", "/tokenize", data={
"content": content
})
assert res_tok.status_code == 200
assert len(res_tok.body["tokens"]) > 5
# detokenize
res_detok = server.make_request("POST", "/detokenize", data={
"tokens": res_tok.body["tokens"],
})
assert res_detok.status_code == 200
assert res_detok.body["content"].strip() == content
def test_tokenize_with_bos():
global server
server.start()
# tokenize
content = "What is the capital of France ?"
bosId = 1
res_tok = server.make_request("POST", "/tokenize", data={
"content": content,
"add_special": True,
})
assert res_tok.status_code == 200
assert res_tok.body["tokens"][0] == bosId
def test_tokenize_with_pieces():
global server
server.start()
# tokenize
content = "This is a test string with unicode 媽 and emoji 🤗"
res_tok = server.make_request("POST", "/tokenize", data={
"content": content,
"with_pieces": True,
})
assert res_tok.status_code == 200
for token in res_tok.body["tokens"]:
assert "id" in token
assert token["id"] > 0
assert "piece" in token
assert len(token["piece"]) > 0

View File

@ -0,0 +1,361 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# type: ignore[reportUnusedImport]
import subprocess
import os
import re
import json
import sys
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import (
Any,
Callable,
ContextManager,
Iterable,
Iterator,
List,
Literal,
Tuple,
Set,
)
from re import RegexFlag
class ServerResponse:
headers: dict
status_code: int
body: dict | Any
class ServerProcess:
# default options
debug: bool = False
server_port: int = 8080
server_host: str = "127.0.0.1"
model_hf_repo: str = "ggml-org/models"
model_hf_file: str = "tinyllamas/stories260K.gguf"
model_alias: str = "tinyllama-2"
temperature: float = 0.8
seed: int = 42
# custom options
model_alias: str | None = None
model_url: str | None = None
model_file: str | None = None
n_threads: int | None = None
n_gpu_layer: int | None = None
n_batch: int | None = None
n_ubatch: int | None = None
n_ctx: int | None = None
n_ga: int | None = None
n_ga_w: int | None = None
n_predict: int | None = None
n_prompts: int | None = 0
slot_save_path: str | None = None
id_slot: int | None = None
cache_prompt: bool | None = None
n_slots: int | None = None
server_continuous_batching: bool | None = False
server_embeddings: bool | None = False
server_reranking: bool | None = False
server_metrics: bool | None = False
draft: int | None = None
api_key: str | None = None
response_format: str | None = None
lora_files: List[str] | None = None
disable_ctx_shift: int | None = False
# session variables
process: subprocess.Popen | None = None
def __init__(self):
if "N_GPU_LAYERS" in os.environ:
self.n_gpu_layer = int(os.environ["N_GPU_LAYERS"])
if "DEBUG" in os.environ:
self.debug = True
if "PORT" in os.environ:
self.server_port = int(os.environ["PORT"])
def start(self, timeout_seconds: int = 10) -> None:
if "LLAMA_SERVER_BIN_PATH" in os.environ:
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
elif os.name == "nt":
server_path = "../../../build/bin/Release/llama-server.exe"
else:
server_path = "../../../build/bin/llama-server"
server_args = [
"--slots", # requires to get slot status via /slots endpoint
"--host",
self.server_host,
"--port",
self.server_port,
"--temp",
self.temperature,
"--seed",
self.seed,
]
if self.model_file:
server_args.extend(["--model", self.model_file])
if self.model_url:
server_args.extend(["--model-url", self.model_url])
if self.model_hf_repo:
server_args.extend(["--hf-repo", self.model_hf_repo])
if self.model_hf_file:
server_args.extend(["--hf-file", self.model_hf_file])
if self.n_batch:
server_args.extend(["--batch-size", self.n_batch])
if self.n_ubatch:
server_args.extend(["--ubatch-size", self.n_ubatch])
if self.n_threads:
server_args.extend(["--threads", self.n_threads])
if self.n_gpu_layer:
server_args.extend(["--n-gpu-layers", self.n_gpu_layer])
if self.draft is not None:
server_args.extend(["--draft", self.draft])
if self.server_continuous_batching:
server_args.append("--cont-batching")
if self.server_embeddings:
server_args.append("--embedding")
if self.server_reranking:
server_args.append("--reranking")
if self.server_metrics:
server_args.append("--metrics")
if self.model_alias:
server_args.extend(["--alias", self.model_alias])
if self.n_ctx:
server_args.extend(["--ctx-size", self.n_ctx])
if self.n_slots:
server_args.extend(["--parallel", self.n_slots])
if self.n_predict:
server_args.extend(["--n-predict", self.n_predict])
if self.slot_save_path:
server_args.extend(["--slot-save-path", self.slot_save_path])
if self.n_ga:
server_args.extend(["--grp-attn-n", self.n_ga])
if self.n_ga_w:
server_args.extend(["--grp-attn-w", self.n_ga_w])
if self.debug:
server_args.append("--verbose")
if self.lora_files:
for lora_file in self.lora_files:
server_args.extend(["--lora", lora_file])
if self.disable_ctx_shift:
server_args.extend(["--no-context-shift"])
if self.api_key:
server_args.extend(["--api-key", self.api_key])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
flags = 0
if "nt" == os.name:
flags |= subprocess.DETACHED_PROCESS
flags |= subprocess.CREATE_NEW_PROCESS_GROUP
flags |= subprocess.CREATE_NO_WINDOW
self.process = subprocess.Popen(
[str(arg) for arg in [server_path, *server_args]],
creationflags=flags,
stdout=sys.stdout,
stderr=sys.stdout,
env={**os.environ, "LLAMA_CACHE": "tmp"},
)
server_instances.add(self)
print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
# wait for server to start
start_time = time.time()
while time.time() - start_time < timeout_seconds:
try:
response = self.make_request("GET", "/slots", headers={
"Authorization": f"Bearer {self.api_key}" if self.api_key else None
})
if response.status_code == 200:
self.ready = True
return # server is ready
except Exception as e:
pass
print(f"Waiting for server to start...")
time.sleep(0.5)
raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
def stop(self) -> None:
server_instances.remove(self)
if self.process:
print(f"Stopping server with pid={self.process.pid}")
self.process.kill()
self.process = None
def make_request(
self,
method: str,
path: str,
data: dict | Any | None = None,
headers: dict | None = None,
) -> ServerResponse:
url = f"http://{self.server_host}:{self.server_port}{path}"
parse_body = False
if method == "GET":
response = requests.get(url, headers=headers)
parse_body = True
elif method == "POST":
response = requests.post(url, headers=headers, json=data)
parse_body = True
elif method == "OPTIONS":
response = requests.options(url, headers=headers)
else:
raise ValueError(f"Unimplemented method: {method}")
result = ServerResponse()
result.headers = dict(response.headers)
result.status_code = response.status_code
result.body = response.json() if parse_body else None
print("Response from server", result.body)
return result
def make_stream_request(
self,
method: str,
path: str,
data: dict | None = None,
headers: dict | None = None,
) -> Iterator[dict]:
url = f"http://{self.server_host}:{self.server_port}{path}"
if method == "POST":
response = requests.post(url, headers=headers, json=data, stream=True)
else:
raise ValueError(f"Unimplemented method: {method}")
for line_bytes in response.iter_lines():
line = line_bytes.decode("utf-8")
if '[DONE]' in line:
break
elif line.startswith('data: '):
data = json.loads(line[6:])
print("Partial response from server", data)
yield data
server_instances: Set[ServerProcess] = set()
class ServerPreset:
@staticmethod
def tinyllama2() -> ServerProcess:
server = ServerProcess()
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "tinyllamas/stories260K.gguf"
server.model_alias = "tinyllama-2"
server.n_ctx = 256
server.n_batch = 32
server.n_slots = 2
server.n_predict = 64
server.seed = 42
return server
@staticmethod
def bert_bge_small() -> ServerProcess:
server = ServerProcess()
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
server.model_alias = "bert-bge-small"
server.n_ctx = 512
server.n_batch = 128
server.n_ubatch = 128
server.n_slots = 2
server.seed = 42
server.server_embeddings = True
return server
@staticmethod
def tinyllama_infill() -> ServerProcess:
server = ServerProcess()
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
server.model_alias = "tinyllama-infill"
server.n_ctx = 2048
server.n_batch = 1024
server.n_slots = 1
server.n_predict = 64
server.temperature = 0.0
server.seed = 42
return server
@staticmethod
def stories15m_moe() -> ServerProcess:
server = ServerProcess()
server.model_hf_repo = "ggml-org/stories15M_MOE"
server.model_hf_file = "stories15M_MOE-F16.gguf"
server.model_alias = "stories15m-moe"
server.n_ctx = 2048
server.n_batch = 1024
server.n_slots = 1
server.n_predict = 64
server.temperature = 0.0
server.seed = 42
return server
@staticmethod
def jina_reranker_tiny() -> ServerProcess:
server = ServerProcess()
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
server.model_alias = "jina-reranker"
server.n_ctx = 512
server.n_batch = 512
server.n_slots = 1
server.seed = 42
server.server_reranking = True
return server
def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
"""
Run multiple functions in parallel and return results in the same order as calls. Equivalent to Promise.all in JS.
Example usage:
results = parallel_function_calls([
(func1, (arg1, arg2)),
(func2, (arg3, arg4)),
])
"""
results = [None] * len(function_list)
exceptions = []
def worker(index, func, args):
try:
result = func(*args)
results[index] = result
except Exception as e:
exceptions.append((index, str(e)))
with ThreadPoolExecutor() as executor:
futures = []
for i, (func, args) in enumerate(function_list):
future = executor.submit(worker, i, func, args)
futures.append(future)
# Wait for all futures to complete
for future in as_completed(futures):
pass
# Check if there were any exceptions
if exceptions:
print("Exceptions occurred:")
for index, error in exceptions:
print(f"Function at index {index}: {error}")
return results
def match_regex(regex: str, text: str) -> bool:
return (
re.compile(
regex, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL
).search(text)
is not None
)

View File

@ -3,7 +3,7 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
```bash ```bash
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" ./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
... ...

View File

@ -70,13 +70,13 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = common_tokenize(ctx_tgt, params.prompt, true, true); inp = common_tokenize(ctx_tgt, params.prompt, true, true);
if (llama_n_ctx(ctx_tgt) < (int) inp.size()) { if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) {
LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt)); LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
return 1; return 1;
} }
if (llama_n_batch(ctx_tgt) < (int) inp.size()) { if (llama_n_batch(ctx_tgt) < (uint32_t) inp.size()) {
LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt)); LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
return 1; return 1;
@ -117,7 +117,8 @@ int main(int argc, char ** argv) {
llama_token id_last = inp.back(); llama_token id_last = inp.back();
// all tokens currently in the target context // all tokens currently in the target context
auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1); llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
int n_past = inp.size() - 1; int n_past = inp.size() - 1;
@ -154,7 +155,7 @@ int main(int argc, char ** argv) {
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1] // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
{ {
// do not waste time on small drafts // do not waste time on small drafts
if (draft.size() < n_draft_min) { if (draft.size() < (size_t) n_draft_min) {
draft.clear(); draft.clear();
} }
@ -181,29 +182,26 @@ int main(int argc, char ** argv) {
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
n_past += ids.size() - 1; n_past += ids.size() - 1;
n_drafted += batch_tgt.n_tokens - 1; n_drafted += draft.size(); // note: we ignore the discarded small drafts
n_accept += ids.size() - 1; n_accept += ids.size() - 1;
n_predict += ids.size();
// process the accepted tokens and update contexts // process the accepted tokens and update contexts
// //
// this is the standard token post-processing that we normally do // this is the standard token post-processing that we normally do
// in this case, we do it for a group of accepted tokens at once // in this case, we do it for a group of accepted tokens at once
// //
{
llama_token id;
std::string token_str;
for (size_t i = 0; i < ids.size(); ++i) { for (size_t i = 0; i < ids.size(); ++i) {
id = ids[i]; prompt_tgt.push_back(id_last);
++n_predict; id_last = ids[i];
if (llama_token_is_eog(model_tgt, id)) { if (llama_token_is_eog(model_tgt, id_last)) {
has_eos = true; has_eos = true;
break; break;
} }
token_str = common_token_to_piece(ctx_tgt, id); const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
if (params.use_color && i + 1 < ids.size()) { if (params.use_color && i + 1 < ids.size()) {
LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str()); LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
@ -212,11 +210,7 @@ int main(int argc, char ** argv) {
} }
} }
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
break;
}
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
{ {
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
@ -224,11 +218,8 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1); llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
} }
prompt_tgt.push_back(id_last); if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1); break;
// remember the last accepted token for the next iteration
id_last = id;
} }
} }

View File

@ -91,6 +91,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_neon (void); GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void); GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void); GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void); GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void); GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes

View File

@ -389,6 +389,9 @@ extern "C" {
GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35, GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };

View File

@ -24,7 +24,7 @@ if (NOT MSVC)
endif() endif()
endif() endif()
function(get_flags CCID CCVER) function(ggml_get_flags CCID CCVER)
set(C_FLAGS "") set(C_FLAGS "")
set(CXX_FLAGS "") set(CXX_FLAGS "")
@ -41,6 +41,7 @@ function(get_flags CCID CCVER)
elseif (CCID STREQUAL "GNU") elseif (CCID STREQUAL "GNU")
set(C_FLAGS -Wdouble-promotion) set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds) set(CXX_FLAGS -Wno-array-bounds)
if (CCVER VERSION_GREATER_EQUAL 8.1.0) if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi) list(APPEND CXX_FLAGS -Wextra-semi)
endif() endif()
@ -69,7 +70,7 @@ if (GGML_ALL_WARNINGS)
list(APPEND C_FLAGS ${WARNING_FLAGS}) list(APPEND C_FLAGS ${WARNING_FLAGS})
list(APPEND CXX_FLAGS ${WARNING_FLAGS}) list(APPEND CXX_FLAGS ${WARNING_FLAGS})
get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>" add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>") "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")

View File

@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
detect_ascend_soc_type(SOC_VERSION) detect_ascend_soc_type(SOC_VERSION)
set(SOC_TYPE "${SOC_VERSION}") set(SOC_TYPE "${SOC_VERSION}")
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
else()
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
endif() endif()
# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P. string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
if (CANN_INSTALL_DIR) if (CANN_INSTALL_DIR)
# Only Support Linux. # Only Support Linux.

View File

@ -21,13 +21,15 @@
*/ */
#include "aclnn_ops.h" #include "aclnn_ops.h"
#include "ggml-impl.h"
#include <aclnnop/aclnn_addcdiv.h>
#include <aclnnop/aclnn_avgpool2d.h> #include <aclnnop/aclnn_avgpool2d.h>
#include <aclnnop/aclnn_batch_matmul.h>
#include <aclnnop/aclnn_cast.h> #include <aclnnop/aclnn_cast.h>
#include <aclnnop/aclnn_constant_pad_nd.h> #include <aclnnop/aclnn_constant_pad_nd.h>
#include <aclnnop/aclnn_copy.h> #include <aclnnop/aclnn_copy.h>
#include <aclnnop/aclnn_cos.h> #include <aclnnop/aclnn_cos.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_exp.h> #include <aclnnop/aclnn_exp.h>
#include <aclnnop/aclnn_fill_scalar.h> #include <aclnnop/aclnn_fill_scalar.h>
#include <aclnnop/aclnn_group_norm.h> #include <aclnnop/aclnn_group_norm.h>
@ -35,6 +37,7 @@
#include <aclnnop/aclnn_layer_norm.h> #include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_matmul.h> #include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_max_pool.h> #include <aclnnop/aclnn_max_pool.h>
#include <aclnnop/aclnn_mm.h>
#include <aclnnop/aclnn_permute.h> #include <aclnnop/aclnn_permute.h>
#include <aclnnop/aclnn_pow_tensor_tensor.h> #include <aclnnop/aclnn_pow_tensor_tensor.h>
#include <aclnnop/aclnn_reduce_sum.h> #include <aclnnop/aclnn_reduce_sum.h>
@ -54,6 +57,7 @@
#include <exception> #include <exception>
#include <vector> #include <vector>
#include "ggml-impl.h"
#include "kernels/ascendc_kernels.h" #include "kernels/ascendc_kernels.h"
#define GGML_COMMON_DECL_C #define GGML_COMMON_DECL_C
@ -1101,9 +1105,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
} }
/** /**
* @brief Creates an ACL tensor initialized with ones using a provided buffer. * @brief Creates an ACL tensor initialized with value using a provided buffer.
* *
* This function initializes a tensor with ones using the specified buffer and * This function initializes a tensor with value using the specified buffer and
* tensor parameters. * tensor parameters.
* *
* @param ctx The context for the CANN backend operations. * @param ctx The context for the CANN backend operations.
@ -1116,9 +1120,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
* @param type_size The size of each element in the tensor data type. * @param type_size The size of each element in the tensor data type.
* @param value The value to be used for initializing the tensor (default * @param value The value to be used for initializing the tensor (default
* is 1.0). * is 1.0).
* @return An ACL tensor initialized with ones. * @return An ACL tensor initialized with value.
*/ */
static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer, static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
size_t n_bytes, int64_t* ne, int64_t dims, size_t n_bytes, int64_t* ne, int64_t dims,
aclDataType type, size_t type_size, aclDataType type, size_t type_size,
float value = 1.0f) { float value = 1.0f) {
@ -1163,7 +1167,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
aclTensor* acl_gamma = aclnn_ones( aclTensor* acl_gamma = aclnn_values(
ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
ggml_cann_type_mapping(src->type), ggml_element_size(src)); ggml_cann_type_mapping(src->type), ggml_element_size(src));
@ -1207,8 +1211,8 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
aclTensor* mask_tensor = aclTensor* mask_tensor =
aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
ggml_element_size(src), value); ggml_element_size(src), value);
uint64_t workspaceSize = 0; uint64_t workspaceSize = 0;
@ -1766,6 +1770,92 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
} }
/**
* @brief Performs element-wise division of tensor1 by tensor2 , multiplies the
result by the scalar value and adds it to self .
*
* Performs element-wise division of tensor1 by tensor2,
* multiplies the result by the scalar value and adds it to self .
* The operation is defined as:
* \f[
* \text{out}_i = \text{selft}_i + \text{value} \times
\frac{\text{tensor1}_i}{\text{tensor2}_i}
* \f]
* @param ctx The context for the CANN backend operations.
* @param acl_self The source tensor on which the addcdiv function will be
applied.
* @param tensor1 Numerator tensor.
* @param tensor2 Denominator tensor.
* @param value The value to be used for coefficient.
*/
static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
aclTensor* acl_self, aclTensor* tensor1,
aclTensor* tensor2, float value) {
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
ctx.stream()));
}
/**
* @brief Matrix division, optionally in-place.
*
* This function division each element of the source tensor `acl_src` by the
* tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
* If `inplace` is true, `acl_dst` will not be used and the operation is
* performed in-place on `acl_src`. The operation is defined as: \f[
* \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
* \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_src Numerator tensor..
* @param acl_other Denominator tensor.
* @param acl_dst The destination tensor where the result will be stored if
* `inplace` is false.
* @param inplace Flag indicating whether to perform the operation in-place on
* `acl_src`.
*/
static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
aclTensor* acl_other, aclTensor* acl_dst,
bool inplace) {
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
if (inplace) {
ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
&workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
ctx.stream()));
} else {
ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
&workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(
aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
}
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
ggml_tensor* dst) { ggml_tensor* dst) {
const ggml_tensor* src = dst->src[0]; const ggml_tensor* src = dst->src[0];
@ -2309,12 +2399,13 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ctx.stream())); ctx.stream()));
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F32: case GGML_TYPE_F32: {
{
#ifdef ASCEND_310P #ifdef ASCEND_310P
// Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes // Special operation for get_row_f32 kernel of 310P: clear the
// content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 8) != 0) { if ((src0->ne[0] % 8) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
} }
#endif #endif
@ -2327,12 +2418,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb); ((ggml_tensor*)dst->extra)->nb);
break; break;
} }
case GGML_TYPE_F16: case GGML_TYPE_F16: {
{
#ifdef ASCEND_310P #ifdef ASCEND_310P
// Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes // Special operation for get_row_f16 kernel of 310P: clear the
// content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 16) != 0) { if ((src0->ne[0] % 16) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16 size_t dst_len =
src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
ggml_type_size(
GGML_TYPE_F32); // out is also f32, even input is f16
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
} }
#endif #endif
@ -2423,7 +2517,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclTensor* acl_weight, aclTensor* acl_dst) { aclTensor* acl_weight, aclTensor* acl_dst) {
int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is
// fp32, atlas a2 will transpose it to HFLOAT32. // fp32, atlas a2 will transpose it to HFLOAT32.
uint64_t workspaceSize = 0; uint64_t workspaceSize = 0;
aclOpExecutor* executor; aclOpExecutor* executor;
void* workspaceAddr = nullptr; void* workspaceAddr = nullptr;
@ -2441,6 +2534,81 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream())); aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
} }
/**
* @brief Performs matrix multiplication of two 2D tensors.
*
* This function computes the matrix multiplication of the input tensor
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
* destination tensor `acl_dst`.
* The operation is defined as:
* \f[
* \text {acl_dst}=\text {acl_input@acl_weight}
* \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_input The input tensor for the matrix multiplication.
* @param acl_weight The weight tensor for the matrix multiplication.
* @param acl_dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
aclTensor* acl_input, aclTensor* acl_weight,
aclTensor* acl_dst) {
int8_t cube_math_type = 2;
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
cube_math_type, &workspaceSize,
&executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
/**
* @brief Performs matrix multiplication of two 3D tensors.
*
* This function computes the matrix multiplication of the input tensor
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
* destination tensor `acl_dst`.
* The operation is defined as:
* \f[
* \text {acl_dst}=\text {acl_input@acl_weight}
* \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_input The input tensor for the matrix multiplication.
* @param acl_weight The weight tensor for the matrix multiplication.
* @param acl_dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
aclTensor* acl_input, aclTensor* acl_weight,
aclTensor* acl_dst) {
int8_t cube_math_type = 2;
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
cube_math_type, &workspaceSize,
&executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(
aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
/** /**
* @brief Performs matrix multiplication with floating-point precision on * @brief Performs matrix multiplication with floating-point precision on
* tensors using the CANN backend. * tensors using the CANN backend.
@ -2462,20 +2630,39 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
// broadcast, when weight ne2 or ne3 is not 1, weight need repeat. // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
BCAST_MUL_MAT_SHAPE(input, weight, dst); BCAST_MUL_MAT_SHAPE(input, weight, dst);
// transpose weight: [1,2,3,4] -> [1,2,4,3] int64_t n_dims = bcast_dims;
if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
n_dims = 2;
} else if (bcast_input_ne[2] == 1) {
n_dims = 3;
}
}
aclTensor* acl_input_tensor =
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
bcast_weight_ne[2], bcast_weight_ne[3], bcast_weight_ne[2], bcast_weight_ne[3],
bcast_weight_ne[4], bcast_weight_ne[5]}; bcast_weight_ne[4], bcast_weight_ne[5]};
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
bcast_weight_nb[2], bcast_weight_nb[3], bcast_weight_nb[2], bcast_weight_nb[3],
bcast_weight_nb[4], bcast_weight_nb[5]}; bcast_weight_nb[4], bcast_weight_nb[5]};
aclTensor* acl_weight_tensor = aclTensor* acl_weight_tensor =
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims); ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
aclTensor* acl_input_tensor = aclTensor* acl_dst =
ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input)); ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
switch (n_dims) {
case 2:
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
case 3:
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
default:
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
}
ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_tensor)); ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@ -2501,46 +2688,42 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src0 = dst->src[0]; // weight
ggml_tensor* src1 = dst->src[1]; // input ggml_tensor* src1 = dst->src[1]; // input
// The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC // The shape of the weight is NCHW.
// is regarded as batch. weight need transpose. // Matrix multiplication uses HW dims.
int64_t weight_ne[] = {src0->ne[1], src0->ne[0]}; // HC is regarded as batch.
// weight need transpose.
float weight_elem_size; float weight_elem_size;
if (type == GGML_TYPE_Q4_0) { if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2; weight_elem_size = float(sizeof(uint8_t)) / 2;
} } else if (type == GGML_TYPE_Q8_0) {
else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t)); weight_elem_size = float(sizeof(uint8_t));
} } else {
else {
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
} }
float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
// size of one matrix is element_size * height * width.
size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
// scale stored at the end of weight. Also need transpose. // scale stored at the end of weight. Also need transpose.
GGML_ASSERT(QK4_0 == QK8_0);
int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
size_t scale_elem_size = sizeof(uint16_t); size_t scale_elem_size = sizeof(uint16_t);
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
scale_elem_size}; scale_elem_size};
size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0; size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
char* scale_offset = (char*)src0->data + weight_size; char* scale_offset = (char*)src0->data + weight_size;
// input // input
void* input_buffer;
size_t input_elem_size = sizeof(uint16_t); size_t input_elem_size = sizeof(uint16_t);
int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
ggml_cann_pool_alloc input_alloctor(ctx.pool()); ggml_cann_pool_alloc input_alloctor(ctx.pool());
void* input_buffer = src1->data;
// case in
if (src1->type != GGML_TYPE_F16) { if (src1->type != GGML_TYPE_F16) {
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
input_buffer =
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
input_buffer = input_alloctor.get();
int64_t* input_cast_ne = src1->ne; int64_t* input_cast_ne = src1->ne;
size_t input_cast_nb[GGML_MAX_DIMS]; size_t input_cast_nb[GGML_MAX_DIMS];
@ -2553,70 +2736,119 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
input_cast_nb, GGML_MAX_DIMS); input_cast_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
ACL_CHECK(aclDestroyTensor(acl_input_tensor)); ACL_CHECK(aclDestroyTensor(acl_input_tensor));
ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
} else {
input_buffer = src1->data;
} }
// output // output
size_t output_elem_size = sizeof(uint16_t); size_t output_elem_size = sizeof(uint16_t);
int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; ggml_cann_pool_alloc output_allocator(ctx.pool());
ggml_cann_pool_alloc output_alloctor( void* output_buffer =
ctx.pool(), ggml_nelements(dst) * output_elem_size); output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
void* output_buffer = output_alloctor.get(); size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
// aclnn // aclnn
int64_t max_elem_size = 65535;
int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
aclOpExecutor* executor = nullptr;
uint64_t workspaceSize = 0; uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr; void* workspaceAddr = nullptr;
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
int64_t batch1 = n1 * src1->ne[2] + c1; int64_t batch1 = (n1 * src1->ne[2]) + c1;
int64_t batch0 = n0 * src0->ne[2] + c0; int64_t batch0 = (n0 * src0->ne[2]) + c0;
aclTensor* acl_input_tensor = ggml_cann_create_tensor( aclTensor* acl_input_tensor = ggml_cann_create_tensor(
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
input_elem_size, input_ne, input_nb, 2); input_elem_size, input_ne, input_nb, 2);
// first split
int64_t weight_ne_offset = 0;
int64_t weight_ne[2] = {
max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
src0->ne[0]};
int64_t scale_ne_offset = 0;
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
int64_t output_ne_offset = 0;
int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
aclTensor* acl_weight_tensor = ggml_cann_create_tensor( aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride, (char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type), weight_elem_size, weight_ne, ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2); weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor( aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2); scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
scale_ne_offset);
aclTensor* acl_output_tensor = ggml_cann_create_tensor( aclTensor* acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2); output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
output_ne_offset);
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
&workspaceSize, &executor)); &workspaceSize, &executor));
if (workspaceAddr == nullptr) {
if (workspaceSize > 0 && workspaceAddr == nullptr) { workspaceAddr = workspace_allocator.alloc(workspaceSize);
ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
workspaceSize);
workspaceAddr = workspace_allocator.get();
} }
ACL_CHECK(aclnnWeightQuantBatchMatmulV2( ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
workspaceAddr, workspaceSize, executor, ctx.stream())); workspaceAddr, workspaceSize, executor, ctx.stream()));
ACL_CHECK(aclDestroyTensor(acl_input_tensor)); ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
// other splits
for (int64_t split = 1; split < split_size; split++) {
weight_ne_offset +=
weight_elem_size * weight_ne[0] * weight_ne[1];
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
? src0->ne[1] - (max_elem_size * split)
: max_elem_size;
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
scale_ne[0] = weight_ne[0];
output_ne_offset +=
output_elem_size * output_ne[0] * output_ne[1];
output_ne[0] = weight_ne[0];
acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
scale_ne_offset);
acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
output_ne_offset);
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
nullptr, nullptr, nullptr, nullptr, QK8_0,
acl_output_tensor, &workspaceSize, &executor));
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
workspaceAddr, workspaceSize, executor, ctx.stream()));
ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_output_tensor)); ACL_CHECK(aclDestroyTensor(acl_output_tensor));
} }
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
}
} }
// cast out // cast out
if (dst->type != GGML_TYPE_F16) {
int64_t* output_cast_ne = dst->ne; int64_t* output_cast_ne = dst->ne;
size_t output_cast_nb[GGML_MAX_DIMS]; size_t output_cast_nb[GGML_MAX_DIMS];
output_cast_nb[0] = sizeof(uint16_t); output_cast_nb[0] = sizeof(uint16_t);
@ -2624,14 +2856,16 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
} }
aclTensor* acl_output_tensor = aclTensor* acl_output_tensor = ggml_cann_create_tensor(
ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size, output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
output_cast_ne, output_cast_nb, GGML_MAX_DIMS); output_cast_nb, GGML_MAX_DIMS);
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT); aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
ggml_cann_type_mapping(dst->type));
ACL_CHECK(aclDestroyTensor(acl_output_tensor)); ACL_CHECK(aclDestroyTensor(acl_output_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
}
} }
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@ -2730,12 +2964,14 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
aclTensor* acl_cos_repeat_tensor, aclTensor* acl_cos_repeat_tensor,
aclTensor* acl_sin_repeat_tensor, aclTensor* acl_sin_repeat_tensor,
float theta_scale, bool is_neox) { float theta_scale, float freq_scale,
float attn_factor, bool is_neox) {
// int sin/cos cache, cache has different repeat method depond on // int sin/cos cache, cache has different repeat method depond on
// @param.is_neox // @param.is_neox
ggml_tensor* src0 = dst->src[0]; // input ggml_tensor* src0 = dst->src[0]; // input
ggml_tensor* src1 = dst->src[1]; // position ggml_tensor* src1 = dst->src[1]; // position
ggml_tensor* src2 = dst->src[2]; // freq_factors
// arange, [0,1,...,ne0/2] // arange, [0,1,...,ne0/2]
int64_t arange_length = src0->ne[0] / 2; int64_t arange_length = src0->ne[0] / 2;
@ -2764,11 +3000,26 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
arange_length * sizeof(float_t)); arange_length * sizeof(float_t));
void* theta_scale_buffer = theta_scale_allocator.get(); void* theta_scale_buffer = theta_scale_allocator.get();
aclTensor* acl_theta_scale_tensor = aclnn_ones( aclTensor* acl_theta_scale_tensor = aclnn_values(
ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne, ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale); GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor); aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
// freq_scale
if (freq_scale != 1) {
aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
}
// freq_factors
if (src2) {
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
src2->data, ggml_cann_type_mapping(src2->type),
ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
nullptr, true);
ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
}
// position // position
GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(src1->type == GGML_TYPE_I32);
int64_t position_length = src1->ne[0]; int64_t position_length = src1->ne[0];
@ -2832,6 +3083,12 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
GGML_MAX_DIMS, ACL_FORMAT_ND); GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor); aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
// attn_factor
if (attn_factor != 1) {
aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
}
// repeat // repeat
if (is_neox) { if (is_neox) {
int64_t repeatsArray[] = {1, 1, 1, 2}; int64_t repeatsArray[] = {1, 1, 1, 2};
@ -2895,19 +3152,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
// TODO: with freq_factors
GGML_ASSERT(src2 == NULL);
// TODO: attn_factor != 1
GGML_ASSERT(attn_factor == 1);
// TODO: n_dims <= ne0 // TODO: n_dims <= ne0
GGML_ASSERT(n_dims == ne0); GGML_ASSERT(n_dims == ne0);
GGML_ASSERT(n_dims % 2 == 0); GGML_ASSERT(n_dims % 2 == 0);
// TODO: ext_factor != 0 // TODO: ext_factor != 0
GGML_ASSERT(ext_factor == 0); GGML_ASSERT(ext_factor == 0);
// TODO: freq_scale != 1
GGML_ASSERT(freq_scale == 1);
// TODO: type == GGML_TYPE_F16
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const float theta_scale = powf(freq_base, -2.0f / n_dims); const float theta_scale = powf(freq_base, -2.0f / n_dims);
@ -2938,7 +3187,217 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t), ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
theta_scale, is_neox); theta_scale, freq_scale, attn_factor, is_neox);
aclTensor* acl_src = ggml_cann_create_tensor(src0);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
#ifdef ASCEND_310P
// Special ROPE operation for 310P
// roll input
void* input_roll_buffer;
aclTensor* acl_minus_one_tensor;
void* minus_one_scale_buffer = nullptr;
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
ggml_cann_pool_alloc minus_one_scale_allocator(
ctx.pool(), sizeof(float_t) * src0->ne[0]);
if (!is_neox) {
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
input_roll_buffer = roll_allocator.get();
int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
src0->ne[2], src0->ne[3]};
size_t input_roll_nb[GGML_MAX_DIMS];
input_roll_nb[0] = ggml_type_size(src0->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
}
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
input_roll_buffer, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
GGML_MAX_DIMS);
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
src0->data, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
GGML_MAX_DIMS);
int64_t shifts[] = {1};
int64_t dims[] = {3};
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
// init [-1, 1, -1, 1, ...]
minus_one_scale_buffer = minus_one_scale_allocator.get();
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
}
acl_minus_one_tensor = aclnn_values(
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
int64_t dim = 3;
int64_t* index = new int64_t[src0->ne[0]];
for (int i = 0; i < src0->ne[0]; i++) {
index[i] = i / 2 * 2;
}
int64_t index_num = src0->ne[0];
float value = -1;
aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
index_num, value);
} else {
// roll input: [q0,q1,q2,...] ->
// [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
input_roll_buffer = roll_allocator.get();
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
input_roll_buffer, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
int64_t shifts[] = {src0->ne[0] / 2};
int64_t dims[] = {3};
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
// init [-1, -1, -1, 1, 11...]
minus_one_scale_buffer = minus_one_scale_allocator.get();
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
}
acl_minus_one_tensor = aclnn_values(
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
// -1 * first half
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
size_t first_half_nb[GGML_MAX_DIMS];
first_half_nb[0] = sizeof(float_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
}
aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
first_half_nb, GGML_MAX_DIMS);
bool inplace = true;
float scale = -1;
aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
}
// TODO: n_dims < ne0
GGML_ASSERT(n_dims == src0->ne[0]);
// input * scale
ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
ggml_nbytes(src0));
void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
size_t input_nb[GGML_MAX_DIMS];
input_nb[0] = ggml_type_size(src0->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
}
aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
input_roll_buffer, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
acl_input_roll_mul_scale_tensor);
// output
void* output_fp32_buffer;
if (src0->type == GGML_TYPE_F32) {
aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
acl_sin_reshape_tensor);
aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
// TODO: ne0 != n_dims in mode2
} else if (src0->type == GGML_TYPE_F16) {
size_t input_fp32_nb[GGML_MAX_DIMS];
input_fp32_nb[0] = sizeof(float_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
}
ggml_cann_pool_alloc fp32_allocator1(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
void* input_fp32_buffer1 = fp32_allocator1.get();
aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
input_fp32_nb, GGML_MAX_DIMS);
ggml_cann_pool_alloc fp32_allocator2(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
void* input_fp32_buffer2 = fp32_allocator2.get();
aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
input_fp32_nb, GGML_MAX_DIMS);
ggml_cann_pool_alloc fp32_allocator(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
output_fp32_buffer = fp32_allocator.get();
aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
input_fp32_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
input_fp32_tensor2);
aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
output_fp32_tensor);
aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_src));
}
return;
#endif
// src0 == GGML_TYPE_F16
// TODO: optimization this `if` code
if (src0->type == GGML_TYPE_F16) {
ggml_cann_pool_alloc sin_final_allocator(
ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
ggml_cann_pool_alloc cos_final_allocator(
ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
void* sin_final_buffer = sin_final_allocator.get();
void* cos_final_buffer = cos_final_allocator.get();
int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
size_t sin_final_nb[GGML_MAX_DIMS];
sin_final_nb[0] = ggml_type_size(src0->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
}
aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
sin_final_buffer, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
GGML_MAX_DIMS);
aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
cos_final_buffer, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
GGML_MAX_DIMS);
aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
ggml_cann_type_mapping(src0->type));
aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
ggml_cann_type_mapping(src0->type));
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
acl_sin_reshape_tensor = acl_sin_final_tensor;
acl_cos_reshape_tensor = acl_cos_final_tensor;
}
uint64_t workspaceSize = 0; uint64_t workspaceSize = 0;
aclOpExecutor* executor; aclOpExecutor* executor;
@ -2950,10 +3409,9 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
acl_mode = 1; acl_mode = 1;
} }
aclTensor* acl_x = ggml_cann_create_tensor(src0);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize( ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor)); acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) { if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get(); workspaceAddr = workspace_allocator.get();
@ -2962,7 +3420,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize, ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
executor, ctx.stream())); executor, ctx.stream()));
ACL_CHECK(aclDestroyTensor(acl_x)); ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyTensor(acl_dst));

View File

@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
struct ggml_backend_cann_context { struct ggml_backend_cann_context {
int32_t device; /**< Device ID. */ int32_t device; /**< Device ID. */
std::string name; /**< Name of the device. */ std::string name; /**< Name of the device. */
std::string description; /**< Description of the device. */
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
aclrtStream streams[GGML_CANN_MAX_STREAMS] = { aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
{nullptr}}; /**< Array of streams for the device. */
/** /**
* @brief Constructor for initializing the context with a given device. * @brief Constructor for initializing the context with a given device.
* @param device Device ID. * @param device Device ID.
*/ */
explicit ggml_backend_cann_context(int device) explicit ggml_backend_cann_context(int device)
: device(device), name("CANN" + std::to_string(device)) {} : device(device), name("CANN" + std::to_string(device)) {
ggml_cann_set_device(device);
description = aclrtGetSocName();
}
/** /**
* @brief Destructor for cleaning up resources. * @brief Destructor for cleaning up resources.

View File

@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
ACL_CHECK(aclrtMemGetAllocationGranularity( ACL_CHECK(aclrtMemGetAllocationGranularity(
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
&info.devices[id].vmm_granularity)); &info.devices[id].vmm_granularity));
size_t free, total;
ggml_backend_cann_get_device_memory(id, &free, &total);
info.devices[id].total_vram = free;
} }
// TODO: add more device info later. // TODO: add more device info later.
@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
* @return A pointer to the allocated buffer. * @return A pointer to the allocated buffer.
*/ */
void* alloc(size_t size, size_t* actual_size) override { void* alloc(size_t size, size_t* actual_size) override {
const size_t alignment = 128;
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
#ifdef DEBUG_CANN_MALLOC #ifdef DEBUG_CANN_MALLOC
int nnz = 0; int nnz = 0;
size_t max_size = 0; size_t max_size = 0;
@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
return ptr; return ptr;
} }
void* ptr; void* ptr;
size_t look_ahead_size = (size_t)(1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
ggml_cann_set_device(device); ggml_cann_set_device(device);
ACL_CHECK( ACL_CHECK(
aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST)); aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
*actual_size = look_ahead_size; *actual_size = size;
pool_size += look_ahead_size; pool_size += size;
#ifdef DEBUG_CANN_MALLOC #ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO( GGML_LOG_INFO(
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
/** /**
* @brief The maximum size of the virtual memory pool (32 GB). * @brief The maximum size of the virtual memory pool (32 GB).
*/ */
static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB size_t max_size;
/** /**
* @brief The device ID associated with this buffer pool. * @brief The device ID associated with this buffer pool.
@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
*/ */
explicit ggml_cann_pool_vmm(int device) explicit ggml_cann_pool_vmm(int device)
: device(device), : device(device),
granularity(ggml_cann_info().devices[device].vmm_granularity) {} granularity(ggml_cann_info().devices[device].vmm_granularity) {
auto dev = ggml_cann_info().devices[device];
granularity = dev.vmm_granularity;
max_size = dev.total_vram;
}
/** /**
* @brief Destructor to free all buffers in the virtual memory pool. * @brief Destructor to free all buffers in the virtual memory pool.
@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
// round up the allocation size to the alignment to ensure that all // round up the allocation size to the alignment to ensure that all
// allocations are aligned for all data types // allocations are aligned for all data types
const size_t alignment = 128; const size_t alignment = 128;
size = alignment * ((size + alignment - 1) / alignment); size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
size_t avail = pool_size - pool_used; size_t avail = pool_size - pool_used;
if (size > avail) { if (size > avail) {
// round up to the next multiple of the granularity // round up to the next multiple of the granularity
size_t reserve_size = size - avail; size_t reserve_size = size - avail;
reserve_size = reserve_size = GGML_PAD(reserve_size, granularity);
granularity * ((reserve_size + granularity - 1) / granularity);
GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE); GGML_ASSERT(pool_size + reserve_size <= max_size);
// allocate more physical memory // allocate more physical memory
aclrtPhysicalMemProp prop = {}; aclrtPhysicalMemProp prop = {};
@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
// reserve virtual address space (if not already reserved) // reserve virtual address space (if not already reserved)
if (pool_addr == 0) { if (pool_addr == 0) {
ACL_CHECK(aclrtReserveMemAddress( ACL_CHECK(aclrtReserveMemAddress(
&pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1)); &pool_addr, max_size, 0, NULL, 1));
} }
// map at the end of the pool // map at the end of the pool
@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
// add to the pool // add to the pool
pool_size += reserve_size; pool_size += reserve_size;
// GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB ( #ifdef DEBUG_CANN_MALLOC
// reserved %llu MB)\n", GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
// device, (unsigned long long) (pool_size/1024/1024), device, (unsigned long long) (pool_size/1024/1024),
// (unsigned long long) (reserve_size/1024/1024)); (unsigned long long) (reserve_size/1024/1024));
#endif
} }
GGML_ASSERT(pool_addr != 0); GGML_ASSERT(pool_addr != 0);
@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
*/ */
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device( std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
int device) { int device) {
// return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device)); return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
} }
@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
static bool ggml_backend_cann_buffer_type_initialized = false; static bool ggml_backend_cann_buffer_type_initialized = false;
if (!ggml_backend_cann_buffer_type_initialized) { if (!ggml_backend_cann_buffer_type_initialized) {
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
ggml_backend_cann_buffer_types[i] = { ggml_backend_cann_buffer_types[i] = {
/* .iface = */ ggml_backend_cann_buffer_type_interface, /* .iface = */ ggml_backend_cann_buffer_type_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
/* .context = */ /* .context = */
new ggml_backend_cann_buffer_type_context{ new ggml_backend_cann_buffer_type_context{
i, "CANN" + std::to_string(i)}, i, "CANN" + std::to_string(i)},
@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
return nullptr; return nullptr;
} }
const size_t alignment = 128;
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
void * hostPtr = nullptr; void * hostPtr = nullptr;
aclError err = aclrtMallocHost((void **) &hostPtr, size); aclError err = aclrtMallocHost((void **) &hostPtr, size);
if (err != ACL_SUCCESS) { if (err != ACL_SUCCESS) {
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, aclGetRecentErrMsg()); size / 1024.0 / 1024.0, aclGetRecentErrMsg());
return nullptr; return nullptr;
@ -1720,13 +1738,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
} }
case GGML_OP_ROPE: { case GGML_OP_ROPE: {
// TODO: with ops-test v == 1 // TODO: with ops-test v == 1
float * freq_scale = (float*)((int32_t*)op->op_params + 6);
float * ext_factor = (float*)((int32_t*)op->op_params + 7); float * ext_factor = (float*)((int32_t*)op->op_params + 7);
float * attn_factor = (float*)((int32_t*)op->op_params + 8);
// TODO: with freq_factors
if (op->src[2] != NULL) {
return false;
}
// TODO: n_dims <= ne0 // TODO: n_dims <= ne0
if (op->src[0]->ne[0] != op->op_params[1]) { if (op->src[0]->ne[0] != op->op_params[1]) {
return false; return false;
@ -1735,21 +1747,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (*ext_factor != 0) { if (*ext_factor != 0) {
return false; return false;
} }
// TODO: freq_scale != 1
if (*freq_scale != 1) {
return false;
}
// TODO: attn_factor != 1
if (*attn_factor != 1) {
return false;
}
//TODO: type == GGML_TYPE_F16
switch (op->src[0]->type) {
case GGML_TYPE_F32:
return true; return true;
default:
return false;
}
} }
case GGML_OP_UPSCALE: { case GGML_OP_UPSCALE: {
// aclnnUpsampleNearest2dGetWorkspaceSize not support // aclnnUpsampleNearest2dGetWorkspaceSize not support

View File

@ -25,6 +25,6 @@ ascendc_library(ascendc_kernels STATIC
${SRC_FILES} ${SRC_FILES}
) )
message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.") message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

View File

@ -20,7 +20,6 @@ class DupByRows {
// Input has four dims. // Input has four dims.
int64_t op_block_num = GetBlockNum(); int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx(); int64_t op_block_idx = GetBlockIdx();
assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
// param // param
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];

View File

@ -2,6 +2,15 @@
// optimize me. Use template to avoid copy code. // optimize me. Use template to avoid copy code.
using namespace AscendC; using namespace AscendC;
#ifdef ASCEND_310P // 310P not support 4bit get row
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support 4bit get row.\n");
}
#else
#define BUFFER_NUM 2 #define BUFFER_NUM 2
@ -110,12 +119,9 @@ class GET_ROW_Q4_0 {
LocalTensor<float> output_local = output_queue.AllocTensor<float>(); LocalTensor<float> output_local = output_queue.AllocTensor<float>();
// TODO: cast more data to speed up. // TODO: cast more data to speed up.
#ifdef ASCEND_310P
// TODO: 310P support quantification
#else
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
#endif
// Only mul need compile by group. // Only mul need compile by group.
half scale = scale_gm.GetValue(scale_offset); half scale = scale_gm.GetValue(scale_offset);
@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
indices_nb_ub, output_ne_ub, output_nb_ub); indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate(); op.calculate();
} }
#endif // #ifdef ASCEND_310P

View File

@ -1,6 +1,14 @@
#include "kernel_operator.h" #include "kernel_operator.h"
using namespace AscendC; using namespace AscendC;
#ifdef ASCEND_310P
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support f16->8bit quantization.\n");
}
#else
#define BUFFER_NUM 2 #define BUFFER_NUM 2
#define QK8_0 32 #define QK8_0 32
@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate(); op.calculate();
} }
#endif // #ifdef ASCEND_310P

View File

@ -1,6 +1,14 @@
#include "kernel_operator.h" #include "kernel_operator.h"
using namespace AscendC; using namespace AscendC;
#ifdef ASCEND_310P // 310P not support f32->8bit quantization
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support f32->8bit quantization.\n");
}
#else
#define BUFFER_NUM 2 #define BUFFER_NUM 2
#define QK8_0 32 #define QK8_0 32
@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate(); op.calculate();
} }
#endif // #ifdef ASCEND_310P

View File

@ -1,6 +1,21 @@
#include "kernel_operator.h" #include "kernel_operator.h"
using namespace AscendC; using namespace AscendC;
#ifdef ASCEND_310P // 310P not support float->4bit quantization
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support f32->4bit quantization.\n");
}
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support f16->4bit quantization.\n");
}
#else
#define BUFFER_NUM 2 #define BUFFER_NUM 2
#define Group_Size 32 #define Group_Size 32
@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate(); op.calculate();
} }
#endif // #ifdef ASCEND_310P

View File

@ -418,6 +418,12 @@ typedef struct {
} block_iq4_xs; } block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 iq4_nl blocks
uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
#endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL

View File

@ -82,20 +82,65 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD) add_compile_definitions(__ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled")
endif () endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif () endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
endif () endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
elseif (APPLE)
if (GGML_NATIVE)
set(USER_PROVIDED_MARCH FALSE)
foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
set(USER_PROVIDED_MARCH TRUE)
break()
endif()
endforeach()
if (NOT USER_PROVIDED_MARCH)
set(MARCH_FLAGS "-march=armv8.2a")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
add_compile_definitions(__ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled")
endif ()
set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
endif ()
endif ()
else() else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")

View File

@ -1,7 +1,3 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: MIT
//
#define GGML_COMMON_IMPL_C #define GGML_COMMON_IMPL_C
#include "ggml-common.h" #include "ggml-common.h"
@ -187,6 +183,8 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)
} }
#endif #endif
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) { static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
assert(QK8_0 == 32); assert(QK8_0 == 32);
assert(k % QK8_0 == 0); assert(k % QK8_0 == 0);
@ -528,7 +526,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen); UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
float * res_ptr = s; float * res_ptr = s;
@ -996,6 +994,102 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
} }
} }
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert (n % qk == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
UNUSED(vy);
UNUSED(nr);
UNUSED(nc);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
float * res_ptr = s;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
float32x4_t sumf = vdupq_n_f32(0);
for (int l = 0; l < nb; l++) {
uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
int32x4_t sumi = vdupq_n_s32(0);
sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
float32x4_t d = a_d * b_d;
sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
}
vst1q_f32(res_ptr + x * 4, sumf);
}
return;
}
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
{
float sumf[4];
int sumi;
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
}
}
}
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
}
}
}
void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
const int qk = QK8_0; const int qk = QK8_0;
const int nb = n / qk; const int nb = n / qk;
@ -1017,7 +1111,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen); UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
float * res_ptr = s; float * res_ptr = s;
@ -3386,6 +3480,117 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
} }
} }
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert (n % qk == 0);
assert (nr % 4 == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
UNUSED(vy);
UNUSED(nr);
UNUSED(nc);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
float32x4_t sumf[4];
for (int m = 0; m < 4; m++) {
sumf[m] = vdupq_n_f32(0);
}
for (int l = 0; l < nb; l++) {
float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
int32x4_t sumi_0 = vdupq_n_s32(0);
int32x4_t sumi_1 = vdupq_n_s32(0);
int32x4_t sumi_2 = vdupq_n_s32(0);
int32x4_t sumi_3 = vdupq_n_s32(0);
for (int k = 0; k < 4; k++) {
int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
}
sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
}
for (int m = 0; m < 4; m++) {
vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
}
}
}
return;
}
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
{
float sumf[4][4];
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++)
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
}
}
}
}
}
// FIXME: this code is duplicated from ggml-aarch64.c // FIXME: this code is duplicated from ggml-aarch64.c
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out; block_q4_0x4 out;
@ -3518,6 +3723,70 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
GGML_UNUSED(data_size); GGML_UNUSED(data_size);
} }
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
block_iq4_nlx4 out;
for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}
const int end = QK4_NL * 2 / blck_size_interleave;
if (blck_size_interleave == 8) {
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
// Using memcpy to avoid unaligned memory accesses
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
}
} else if (blck_size_interleave == 4) {
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
}
} else {
GGML_ASSERT(false);
}
return out;
}
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
const block_iq4_nl * src = (const block_iq4_nl *)data;
block_iq4_nl dst_tmp[4];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 4;
int nblocks = t->ne[0] / QK4_0;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}
for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
}
src += nrows_interleaved * nblocks;
}
return 0;
GGML_UNUSED(data_size);
}
// Prepare for optimized kernels if applicable // Prepare for optimized kernels if applicable
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) { void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
if (cur->type == repack_type) { if (cur->type == repack_type) {
@ -3525,8 +3794,7 @@ void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_
return; return;
} }
GGML_ASSERT(cur->type == GGML_TYPE_Q4_0); if (cur->type == GGML_TYPE_Q4_0) {
switch (repack_type) { switch (repack_type) {
case GGML_TYPE_Q4_0_8_8: case GGML_TYPE_Q4_0_8_8:
repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size); repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
@ -3540,6 +3808,17 @@ void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_
default: default:
GGML_ABORT("Unsupported type"); GGML_ABORT("Unsupported type");
} }
} else if (cur->type == GGML_TYPE_IQ4_NL) {
switch (repack_type) {
case GGML_TYPE_IQ4_NL_4_4:
repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size);
break;
default:
GGML_ABORT("Unsupported type");
}
} else {
GGML_ABORT("Unsupported type");
}
} }
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) { enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
@ -3551,9 +3830,13 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
return GGML_TYPE_Q4_0_4_8; return GGML_TYPE_Q4_0_4_8;
} }
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
return GGML_TYPE_Q4_0_4_4; return GGML_TYPE_Q4_0_4_4;
} }
} else if (cur->type == GGML_TYPE_IQ4_NL) {
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
return GGML_TYPE_IQ4_NL_4_4;
}
} }
return cur->type; return cur->type;

View File

@ -15,11 +15,13 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM // GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size); void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur); enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);

View File

@ -1813,11 +1813,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
l1, r1)), l2, r2)), l3, r3))), scale); l1, r1)), l2, r2)), l3, r3))), scale);
} }
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
vst1_f32(s, vget_low_f32(sumv2)); vst1_f32(s, vget_low_f32 (sumv2));
vst1_f32(s + bs, vget_high_f32(sumv2)); vst1_f32(s + bs, vget_high_f32(sumv2));
return; return;
} }
#endif #endif

View File

@ -109,10 +109,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
#if defined(__ARM_ARCH) #if defined(__ARM_ARCH)
struct ggml_arm_arch_features_type { struct ggml_arm_arch_features_type {
int has_neon; int has_neon;
int has_dotprod;
int has_i8mm; int has_i8mm;
int has_sve; int has_sve;
int sve_cnt; int sve_cnt;
} ggml_arm_arch_features = {-1, -1, -1, 0}; } ggml_arm_arch_features = {-1, -1, -1, -1, 0};
#endif #endif
@ -446,6 +447,15 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_IQ4_NL_4_4] = {
.from_float = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_iq4_nl_4x4_q8_0,
.gemm = ggml_gemm_iq4_nl_4x4_q8_0,
},
}; };
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@ -2439,6 +2449,7 @@ static void ggml_init_arm_arch_features(void) {
uint32_t hwcap2 = getauxval(AT_HWCAP2); uint32_t hwcap2 = getauxval(AT_HWCAP2);
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
@ -2453,6 +2464,11 @@ static void ggml_init_arm_arch_features(void) {
} }
ggml_arm_arch_features.has_neon = oldp; ggml_arm_arch_features.has_neon = oldp;
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
oldp = 0;
}
ggml_arm_arch_features.has_dotprod = oldp;
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
oldp = 0; oldp = 0;
} }
@ -7560,14 +7576,6 @@ UseGgmlGemm2:;
// This is the size of the rest of the dimensions of the result // This is the size of the rest of the dimensions of the result
const int64_t nr1 = ne1 * ne2 * ne3; const int64_t nr1 = ne1 * ne2 * ne3;
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
int64_t num_rows_per_vec_dot = vec_dot_num_rows;
// TODO: currently the mmla kernels support only even numbered rows/cols.
// this check can be removed once they are extended to support odd numbered rows/cols too
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
num_rows_per_vec_dot = 1;
}
// Now select a reasonable chunk size. // Now select a reasonable chunk size.
int chunk_size = 16; int chunk_size = 16;
@ -7630,6 +7638,15 @@ UseGgmlGemm2:;
const int64_t ir1_start = dr1 * ith1; const int64_t ir1_start = dr1 * ith1;
const int64_t ir1_end = MIN(ir1_start + dr1, nr1); const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
int64_t num_rows_per_vec_dot = vec_dot_num_rows;
// TODO: currently the mmla kernels support only even numbered rows/cols.
// this check can be removed once they are extended to support odd numbered rows/cols too
if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
num_rows_per_vec_dot = 1;
}
ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
if (nth >= nchunk0 * nchunk1) { if (nth >= nchunk0 * nchunk1) {
@ -9133,6 +9150,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8: case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_IQ4_NL_4_4:
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:
@ -13880,6 +13898,14 @@ int ggml_cpu_has_neon(void) {
#endif #endif
} }
int ggml_cpu_has_dotprod(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
return ggml_arm_arch_features.has_dotprod;
#else
return 0;
#endif
}
int ggml_cpu_has_sve(void) { int ggml_cpu_has_sve(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
return ggml_arm_arch_features.has_sve; return ggml_arm_arch_features.has_sve;

View File

@ -457,7 +457,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * src1 = op->src[1];
if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) { if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
return false; return false;
} }
} }

View File

@ -132,7 +132,7 @@ if (CUDAToolkit_FOUND)
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER}) ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
endif() endif()

View File

@ -47,9 +47,20 @@
#define CC_TURING 750 #define CC_TURING 750
#define CC_AMPERE 800 #define CC_AMPERE 800
#define CC_OFFSET_AMD 1000000 #define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // GCN/CNDA, wave size is 64
#define CC_RDNA3 (CC_OFFSET_AMD + 1100) #define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
#define CC_QY1 210 #define CC_QY1 210
#define CC_QY2 220 #define CC_QY2 220

View File

@ -1107,6 +1107,11 @@ static void ggml_cuda_op_mul_mat_cublas(
const half alpha_f16 = 1.0f; const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f; const half beta_f16 = 0.0f;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
}
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
CUBLAS_CHECK( CUBLAS_CHECK(
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
@ -1114,7 +1119,7 @@ static void ggml_cuda_op_mul_mat_cublas(
&alpha_f16, src0_ptr, CUDA_R_16F, ne00, &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10, src1_ptr, CUDA_R_16F, ne10,
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc, &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
CUBLAS_COMPUTE_16F, cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
@ -1607,6 +1612,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F; cudaDataType_t cu_data_type = CUDA_R_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
}
// dst strides // dst strides
size_t nbd2 = dst->nb[2]; size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3]; size_t nbd3 = dst->nb[3];

View File

@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
} }
return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
} }

View File

@ -2570,9 +2570,9 @@ static __device__ void mul_mat_q_process_tile(
template <ggml_type type, int mmq_x, int nwarps, bool need_check> template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2) #if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2) #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else #else
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1) __launch_bounds__(WARP_SIZE*nwarps, 1)

View File

@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
int64_t nwarps = 1; int64_t nwarps = 1;
int64_t rows_per_cuda_block = 1; int64_t rows_per_cuda_block = 1;
if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2 if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
switch(ncols_y) { switch(ncols_y) {
case 1: case 1:
nwarps = 4; nwarps = 4;

View File

@ -95,6 +95,14 @@
#define __CUDA_ARCH__ 1300 #define __CUDA_ARCH__ 1300
#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
#define CDNA
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__) defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3 #define RDNA3

View File

@ -14,7 +14,7 @@
#include <arm_sve.h> #include <arm_sve.h>
#endif // __ARM_FEATURE_SVE #endif // __ARM_FEATURE_SVE
#if defined(__ARM_NEON) #if defined(__ARM_NEON) && !defined(__CUDACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
// //
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/

View File

@ -105,8 +105,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
kompute-shaders/op_getrows_q4_0.comp kompute-shaders/op_getrows_q4_0.comp
kompute-shaders/op_getrows_q4_1.comp kompute-shaders/op_getrows_q4_1.comp
kompute-shaders/op_getrows_q6_k.comp kompute-shaders/op_getrows_q6_k.comp
kompute-shaders/op_rope_f16.comp kompute-shaders/op_rope_norm_f16.comp
kompute-shaders/op_rope_f32.comp kompute-shaders/op_rope_norm_f32.comp
kompute-shaders/op_rope_neox_f16.comp
kompute-shaders/op_rope_neox_f32.comp
kompute-shaders/op_cpy_f16_f16.comp kompute-shaders/op_cpy_f16_f16.comp
kompute-shaders/op_cpy_f16_f32.comp kompute-shaders/op_cpy_f16_f32.comp
kompute-shaders/op_cpy_f32_f16.comp kompute-shaders/op_cpy_f32_f16.comp
@ -139,8 +141,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
shaderop_getrows_q4_0.h shaderop_getrows_q4_0.h
shaderop_getrows_q4_1.h shaderop_getrows_q4_1.h
shaderop_getrows_q6_k.h shaderop_getrows_q6_k.h
shaderop_rope_f16.h shaderop_rope_norm_f16.h
shaderop_rope_f32.h shaderop_rope_norm_f32.h
shaderop_rope_neox_f16.h
shaderop_rope_neox_f32.h
shaderop_cpy_f16_f16.h shaderop_cpy_f16_f16.h
shaderop_cpy_f16_f32.h shaderop_cpy_f16_f32.h
shaderop_cpy_f32_f16.h shaderop_cpy_f32_f16.h

View File

@ -28,8 +28,10 @@
#include "shaderop_getrows_q4_0.h" #include "shaderop_getrows_q4_0.h"
#include "shaderop_getrows_q4_1.h" #include "shaderop_getrows_q4_1.h"
#include "shaderop_getrows_q6_k.h" #include "shaderop_getrows_q6_k.h"
#include "shaderop_rope_f16.h" #include "shaderop_rope_norm_f16.h"
#include "shaderop_rope_f32.h" #include "shaderop_rope_norm_f32.h"
#include "shaderop_rope_neox_f16.h"
#include "shaderop_rope_neox_f32.h"
#include "shaderop_cpy_f16_f16.h" #include "shaderop_cpy_f16_f16.h"
#include "shaderop_cpy_f16_f32.h" #include "shaderop_cpy_f16_f32.h"
#include "shaderop_cpy_f32_f16.h" #include "shaderop_cpy_f32_f16.h"
@ -345,7 +347,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = { std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
vk::DescriptorPoolSize( vk::DescriptorPoolSize(
vk::DescriptorType::eStorageBuffer, vk::DescriptorType::eStorageBuffer,
3 * size // Descriptor count is number of possible tensors to pass into an algorithm 4 * size // Descriptor count is number of possible tensors to pass into an algorithm
) )
}; };
@ -788,7 +790,8 @@ static void ggml_vk_soft_max(
const std::shared_ptr<kp::Tensor>& out, const std::shared_ptr<kp::Tensor>& out,
uint32_t inAOff, uint32_t inBOff, uint32_t outOff, uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03, int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
float scale float scale, float max_bias, float m0, float m1,
uint32_t n_head_log2
) { ) {
const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv, const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
kp::shader_data::op_softmax_comp_spv_len); kp::shader_data::op_softmax_comp_spv_len);
@ -796,12 +799,14 @@ static void ggml_vk_soft_max(
struct PushConstants { struct PushConstants {
uint32_t inAOff, inBOff, outOff; uint32_t inAOff, inBOff, outOff;
int32_t ne00, ne01, ne02; int32_t ne00, ne01, ne02;
float scale; float scale, max_bias, m0, m1;
uint32_t n_head_log2;
int32_t mask; int32_t mask;
} pushConsts { } pushConsts {
safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
ne00, ne01, ne02, ne00, ne01, ne02,
scale, scale, max_bias, m0, m1,
n_head_log2,
bool(inB) bool(inB)
}; };
@ -911,9 +916,9 @@ static void ggml_vk_mul_mat_f16(
const std::shared_ptr<kp::Tensor>& out, const std::shared_ptr<kp::Tensor>& out,
uint32_t inAOff, uint32_t inBOff, uint32_t outOff, uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne00, int32_t ne01, int32_t ne02,
uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
int32_t ne0, int32_t ne1, int32_t ne0, int32_t ne1,
uint32_t r2, uint32_t r3 uint32_t r2, uint32_t r3
) { ) {
@ -923,17 +928,17 @@ static void ggml_vk_mul_mat_f16(
struct PushConstants { struct PushConstants {
uint32_t inAOff, inBOff, outOff; uint32_t inAOff, inBOff, outOff;
int32_t ne00, ne01, ne02; int32_t ne00, ne01, ne02;
uint32_t nb00, nb01, nb02; uint32_t nb00, nb01, nb02, nb03;
int32_t ne10, ne11, ne12; int32_t ne10, ne11, ne12;
uint32_t nb10, nb11, nb12; uint32_t nb10, nb11, nb12, nb13;
int32_t ne0, ne1; int32_t ne0, ne1;
uint32_t r2, r3; uint32_t r2, r3;
} pushConsts { } pushConsts {
safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
ne00, ne01, ne02, ne00, ne01, ne02,
nb00, nb01, nb02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, ne10, ne11, ne12,
nb10, nb11, nb12, nb10, nb11, nb12, nb13,
ne0, ne1, ne0, ne1,
r2, r3 r2, r3
}; };
@ -1013,6 +1018,8 @@ static void ggml_vk_mul_mat_impl(
int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne00, int32_t ne01, int32_t ne02,
int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
int32_t ne0, int32_t ne1, int32_t ne0, int32_t ne1,
uint32_t nb01, uint32_t nb02, uint32_t nb03,
uint32_t nb11, uint32_t nb12, uint32_t nb13,
uint32_t r2, uint32_t r3 uint32_t r2, uint32_t r3
) { ) {
struct PushConstants { struct PushConstants {
@ -1020,19 +1027,23 @@ static void ggml_vk_mul_mat_impl(
int32_t ne00, ne01, ne02; int32_t ne00, ne01, ne02;
int32_t ne10, ne12; int32_t ne10, ne12;
int32_t ne0, ne1; int32_t ne0, ne1;
uint32_t nb01, nb02, nb03;
uint32_t nb11, nb12, nb13;
uint32_t r2, r3; uint32_t r2, r3;
} pushConsts { } pushConsts {
safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
ne00, ne01, ne02, ne00, ne01, ne02,
ne10, ne12, ne10, ne12,
ne0, ne1, ne0, ne1,
nb01, nb02, nb03,
nb11, nb12, nb13,
r2, r3 r2, r3
}; };
auto name = std::string(__func__) + "_" + suffix; auto name = std::string(__func__) + "_" + suffix;
std::shared_ptr<kp::Algorithm> s_algo = nullptr; std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(name)) { if (!komputeManager()->hasAlgorithm(name)) {
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts}); s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
} else { } else {
s_algo = komputeManager()->getAlgorithm(name); s_algo = komputeManager()->getAlgorithm(name);
@ -1074,19 +1085,26 @@ static void ggml_vk_mul_mat_q4_k(
const std::shared_ptr<kp::Tensor>& inB, const std::shared_ptr<kp::Tensor>& inB,
const std::shared_ptr<kp::Tensor>& out, const std::shared_ptr<kp::Tensor>& out,
uint32_t inAOff, uint32_t inBOff, uint32_t outOff, uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10, int32_t ne00, int32_t ne01, int32_t ne02,
int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0, int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
int32_t ne1, int32_t r2, int32_t r3 int32_t ne0, int32_t ne1,
uint32_t nb01, uint32_t nb02, uint32_t nb03,
uint32_t nb11, uint32_t nb12, uint32_t nb13,
uint32_t r2, uint32_t r3
) { ) {
const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv, const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
kp::shader_data::op_mul_mat_q4_k_comp_spv_len); kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
struct PushConstants { struct PushConstants {
uint32_t inAOff, inBOff, outOff; uint32_t inAOff, inBOff, outOff;
int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3; int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
uint32_t r2, r3;
} pushConsts { } pushConsts {
0, 0, 0, inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3 ne00, ne10, ne0, ne1, ne01, ne02, ne12,
nb01, nb02, nb03, nb11, nb12, nb13,
r2, r3
}; };
std::shared_ptr<kp::Algorithm> s_algo = nullptr; std::shared_ptr<kp::Algorithm> s_algo = nullptr;
@ -1108,28 +1126,37 @@ static void ggml_vk_mul_mat_q6_k(
const std::shared_ptr<kp::Tensor>& inB, const std::shared_ptr<kp::Tensor>& inB,
const std::shared_ptr<kp::Tensor>& out, const std::shared_ptr<kp::Tensor>& out,
uint32_t inAOff, uint32_t inBOff, uint32_t outOff, uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1, int32_t ne00, int32_t ne01, int32_t ne02,
int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02 int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
int32_t ne0, int32_t ne1,
uint32_t nb01, uint32_t nb02, uint32_t nb03,
uint32_t nb11, uint32_t nb12, uint32_t nb13,
uint32_t r2, uint32_t r3
) { ) {
const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv, const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
kp::shader_data::op_mul_mat_q6_k_comp_spv_len); kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
struct PushConstants { struct PushConstants {
uint32_t inAOff, inBOff, outOff; uint32_t inAOff, inBOff, outOff;
int32_t ne00, ne10, ne0, ne1, ne01, gqa; int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
uint32_t r2, r3;
} pushConsts { } pushConsts {
inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
ne00, ne10, ne0, ne1, ne01, ne12/ne02 ne00, ne10, ne0, ne1, ne01, ne02, ne12,
nb01, nb02, nb03, nb11, nb12, nb13,
r2, r3
}; };
std::shared_ptr<kp::Algorithm> s_algo = nullptr; std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(__func__)) { if (!komputeManager()->hasAlgorithm(__func__)) {
const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; const uint32_t local_x = 2;
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); const uint32_t local_y = ggml_vk_current_device().subgroupSize;
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
} else { } else {
s_algo = komputeManager()->getAlgorithm(__func__); s_algo = komputeManager()->getAlgorithm(__func__);
s_algo->setTensors({inA, inB, out}); s_algo->setTensors({inA, inB, out});
s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}); s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
s_algo->setPushConstants<PushConstants>({pushConsts}); s_algo->setPushConstants<PushConstants>({pushConsts});
s_algo->updateDescriptors(s_kompute_context->pool.get()); s_algo->updateDescriptors(s_kompute_context->pool.get());
} }
@ -1217,10 +1244,11 @@ static void ggml_vk_rope(
kp::Sequence& seq, kp::Sequence& seq,
const std::shared_ptr<kp::Tensor>& inA, const std::shared_ptr<kp::Tensor>& inA,
const std::shared_ptr<kp::Tensor>& inB, const std::shared_ptr<kp::Tensor>& inB,
const std::shared_ptr<kp::Tensor>& inC,
const std::shared_ptr<kp::Tensor>& out, const std::shared_ptr<kp::Tensor>& out,
uint32_t inAOff, uint32_t inBOff, uint32_t outOff, uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig, ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow, float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
int32_t ne01, int32_t ne02, int32_t ne03, int32_t ne01, int32_t ne02, int32_t ne03,
uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
int32_t ne0, int32_t ne0,
@ -1228,11 +1256,17 @@ static void ggml_vk_rope(
) { ) {
GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32); GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
static const auto spirv_f16 = getSpirvShader( static const auto spirv_norm_f16 = getSpirvShader(
kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
); );
static const auto spirv_f32 = getSpirvShader( static const auto spirv_norm_f32 = getSpirvShader(
kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
);
static const auto spirv_neox_f16 = getSpirvShader(
kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
);
static const auto spirv_neox_f32 = getSpirvShader(
kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
); );
int type_size = src0t == GGML_TYPE_F16 ? 2 : 4; int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
@ -1247,32 +1281,40 @@ static void ggml_vk_rope(
GGML_ASSERT(nb0 % type_size == 0); GGML_ASSERT(nb0 % type_size == 0);
struct PushConstants { struct PushConstants {
uint32_t inAOff, inBOff, outOff; uint32_t inAOff, inBOff, inCOff, outOff;
int32_t n_dims, mode, n_ctx_orig; int32_t n_dims, mode, n_ctx_orig;
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; float freq_base, freq_scale;
bool has_freq_factors;
float ext_factor, attn_factor, beta_fast, beta_slow;
uint32_t nb00, nb01, nb02, nb03; uint32_t nb00, nb01, nb02, nb03;
int32_t ne0; int32_t ne0;
uint32_t nb0, nb1, nb2, nb3; uint32_t nb0, nb1, nb2, nb3;
} pushConsts { } pushConsts {
safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size), safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
n_dims, mode, n_ctx_orig, n_dims, mode, n_ctx_orig,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, freq_base, freq_scale,
has_freq_factors,
ext_factor, attn_factor, beta_fast, beta_slow,
nb00, nb01, nb02, nb03, nb00, nb01, nb02, nb03,
ne0, ne0,
nb0, nb1, nb2, nb3 nb0, nb1, nb2, nb3
}; };
auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32"); auto & inC_ = inC ? inC : inA;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_f16 = src0t == GGML_TYPE_F16;
auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
std::shared_ptr<kp::Algorithm> s_algo = nullptr; std::shared_ptr<kp::Algorithm> s_algo = nullptr;
if (!komputeManager()->hasAlgorithm(name)) { if (!komputeManager()->hasAlgorithm(name)) {
auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
s_algo = komputeManager()->algorithm<float, PushConstants>( s_algo = komputeManager()->algorithm<float, PushConstants>(
name, s_kompute_context->pool.get(), {inA, inB, out}, name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
{unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts} {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
); );
} else { } else {
s_algo = komputeManager()->getAlgorithm(name); s_algo = komputeManager()->getAlgorithm(name);
s_algo->setTensors({inA, inB, out}); s_algo->setTensors({inA, inB, inC_, out});
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
s_algo->setPushConstants<PushConstants>({pushConsts}); s_algo->setPushConstants<PushConstants>({pushConsts});
s_algo->updateDescriptors(s_kompute_context->pool.get()); s_algo->updateDescriptors(s_kompute_context->pool.get());
@ -1351,11 +1393,15 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
} }
static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
int64_t n = ggml_nelements(op);
switch (op->op) { switch (op->op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
if (n % 4 != 0) return false;
switch (ggml_get_unary_op(op)) { switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
if (n % 8 != 0) return false;
// fall through
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
return ggml_is_contiguous(op->src[0]); return ggml_is_contiguous(op->src[0]);
default: default:
@ -1413,8 +1459,8 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
switch (op->src[0]->type) { switch (op->src[0]->type) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
case GGML_TYPE_Q6_K:
return op->ne[3] == 1; return op->ne[3] == 1;
case GGML_TYPE_Q6_K:
case GGML_TYPE_F16: case GGML_TYPE_F16:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
@ -1515,9 +1561,11 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
const static std::shared_ptr<kp::Tensor> nullTensor = nullptr; const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
uint32_t off_src0 = 0; uint32_t off_src0 = 0;
uint32_t off_src1 = 0; uint32_t off_src1 = 0;
uint32_t off_src2 = 0;
uint32_t off_dst = 0; uint32_t off_dst = 0;
const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor; const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor; const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor; const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
switch (dst->op) { switch (dst->op) {
@ -1593,11 +1641,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32); GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
#pragma message("TODO: add ALiBi support") const int64_t nrows_x = ggml_nrows(src0);
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192") const int64_t nrows_y = src0->ne[1];
GGML_ASSERT(max_bias == 0.0f);
ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale); const uint32_t n_head = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
} break; } break;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
{ {
@ -1649,38 +1702,44 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
case GGML_TYPE_F16: case GGML_TYPE_F16:
ggml_vk_mul_mat_f16( ggml_vk_mul_mat_f16(
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
ne0, ne1, r2, r3 ne0, ne1, r2, r3
); );
break; break;
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
ggml_vk_mul_mat_q8_0( ggml_vk_mul_mat_q8_0(
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
); );
break; break;
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
ggml_vk_mul_mat_q4_0( ggml_vk_mul_mat_q4_0(
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
); );
break; break;
case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_1:
ggml_vk_mul_mat_q4_1( ggml_vk_mul_mat_q4_1(
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
); );
break; break;
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
ggml_vk_mul_mat_q4_k( ggml_vk_mul_mat_q4_k(
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03 ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
); );
break; break;
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
ggml_vk_mul_mat_q6_k( ggml_vk_mul_mat_q6_k(
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02 ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
); );
break; break;
default: { default: {
@ -1709,13 +1768,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
#pragma message("TODO: implement phi3 frequency factors support")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
#pragma message("TODO: update rope NORM mode to match NEOX mode")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
GGML_ASSERT(ne10 == ne02); GGML_ASSERT(ne10 == ne02);
GGML_ASSERT(src0t == dstt); GGML_ASSERT(src0t == dstt);
// const int n_past = ((int32_t *) dst->op_params)[0]; // const int n_past = ((int32_t *) dst->op_params)[0];
@ -1724,6 +1776,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
// skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
const bool has_freq_factors = dst->src[2] != nullptr;
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@ -1732,8 +1786,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
ggml_vk_rope( ggml_vk_rope(
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig, seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
); );
} break; } break;

View File

@ -3,6 +3,7 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
#extension GL_EXT_control_flow_attributes: enable #extension GL_EXT_control_flow_attributes: enable
#extension GL_KHR_shader_subgroup_arithmetic : require #extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_EXT_debug_printf : enable #extension GL_EXT_debug_printf : enable

View File

@ -20,12 +20,14 @@ layout (push_constant) uniform parameter {
uint nb00; uint nb00;
uint nb01; uint nb01;
uint nb02; uint nb02;
uint nb03;
int ne10; int ne10;
int ne11; int ne11;
int ne12; int ne12;
uint nb10; uint nb10;
uint nb11; uint nb11;
uint nb12; uint nb12;
uint nb13;
int ne0; int ne0;
int ne1; int ne1;
uint r2; uint r2;
@ -42,7 +44,7 @@ void main() {
const uint i12 = im%pcs.ne12; const uint i12 = im%pcs.ne12;
const uint i13 = im/pcs.ne12; const uint i13 = im/pcs.ne12;
const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02; const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
const uint x = offset0 / 2 + pcs.inAOff; // Based from inA const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
@ -52,7 +54,7 @@ void main() {
break; break;
} }
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
float sumf = 0; float sumf = 0;
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {

View File

@ -24,8 +24,14 @@ layout (push_constant) uniform parameter {
int ne01; int ne01;
int ne02; int ne02;
int ne12; int ne12;
int r2; uint nb01;
int r3; uint nb02;
uint nb03;
uint nb11;
uint nb12;
uint nb13;
uint r2;
uint r3;
} pcs; } pcs;
void main() { void main() {
@ -50,10 +56,11 @@ void main() {
const uint i12 = im%pcs.ne12; const uint i12 = im%pcs.ne12;
const uint i13 = im/pcs.ne12; const uint i13 = im/pcs.ne12;
const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
const uint xblk = ib_row + offset0 + pcs.inAOff; const uint xblk = offset0 + pcs.inAOff;
const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; const uint y = (offset1 / 4) + pcs.inBOff;
float yl[16]; float yl[16];
float yh[16]; float yh[16];
@ -74,7 +81,7 @@ void main() {
} }
for (int row = 0; row < N_DST; row++) { for (int row = 0; row < N_DST; row++) {
uint row_idx = row * nb; uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0); uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2); uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);

View File

@ -21,7 +21,16 @@ layout (push_constant) uniform parameter {
int ne0; int ne0;
int ne1; int ne1;
int ne01; int ne01;
int gqa; int ne02;
int ne12;
uint nb01;
uint nb02;
uint nb03;
uint nb11;
uint nb12;
uint nb13;
uint r2;
uint r3;
} pcs; } pcs;
void main() { void main() {
@ -34,12 +43,15 @@ void main() {
const uint r0 = gl_WorkGroupID.x; const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y; const uint r1 = gl_WorkGroupID.y;
const uint r2 = gl_WorkGroupID.z; const uint im = gl_WorkGroupID.z;
const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
const uint x = row * nb + offset0; // Based from inA without base offset const uint i12 = im%pcs.ne12;
const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB const uint i13 = im/pcs.ne12;
const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
float sumf = 0; float sumf = 0;
@ -89,6 +101,6 @@ void main() {
const float tot = subgroupAdd(sumf); const float tot = subgroupAdd(sumf);
if (subgroupElect()) { if (subgroupElect()) {
out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
} }
} }

View File

@ -14,10 +14,15 @@ void main() {
const uint i12 = im%pcs.ne12; const uint i12 = im%pcs.ne12;
const uint i13 = im/pcs.ne12; const uint i13 = im/pcs.ne12;
const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); // pointers to src0 rows
uint ax[N_ROWS];
for (int row = 0; row < N_ROWS; ++row) {
const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
const uint x = offset0; // Based from inA without base offset ax[row] = offset0 + pcs.inAOff;
const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB }
const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f}; float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
@ -32,8 +37,7 @@ void main() {
for (uint ib = ix; ib < nb; ib += 16) { for (uint ib = ix; ib < nb; ib += 16) {
for (int row = 0; row < N_ROWS; row++) { for (int row = 0; row < N_ROWS; row++) {
const uint block_index = x + ib + row * nb; sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
sumf[row] += block_q_n_dot_y(block_index, yb, il);
} }
yb += BLOCKS_IN_QUANT * 16; yb += BLOCKS_IN_QUANT * 16;

View File

@ -1,5 +1,5 @@
layout(local_size_x_id = 0) in; layout(local_size_x_id = 0) in;
layout(local_size_y = 1) in; layout(local_size_y = 8) in;
layout(local_size_z = 1) in; layout(local_size_z = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
@ -17,6 +17,12 @@ layout (push_constant) uniform parameter {
int ne12; int ne12;
int ne0; int ne0;
int ne1; int ne1;
uint nb01;
uint nb02;
uint nb03;
uint nb11;
uint nb12;
uint nb13;
uint r2; uint r2;
uint r3; uint r3;
} pcs; } pcs;

Some files were not shown because too many files have changed in this diff Show More