Merge branch 'master' into support_glm_edge_model

This commit is contained in:
piDack 2024-12-03 13:27:23 +08:00 committed by GitHub
commit 82cbfda7b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
104 changed files with 2300 additions and 2467 deletions

View File

@ -17,8 +17,10 @@ Checks: >
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
portability-*,
-portability-simd-intrinsics,
misc-*,
-misc-const-correctness,
-misc-non-private-member-variables-in-classes,
-misc-no-recursion,
-misc-use-anonymous-namespace,
FormatStyle: none

View File

@ -3,22 +3,34 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
RUN \
# Build multiple versions of the CPU backend
scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \
scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \
scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
# Build llama-server
cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build --target llama-server -j $(nproc) && \
# Copy the built libraries to /app/lib
mkdir -p /app/lib && \
mv libggml-cpu* /app/lib/ && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/llama-server /llama-server
COPY --from=build /app/build/bin/llama-server /llama-server
COPY --from=build /app/lib/ /
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine

View File

@ -1,7 +1 @@
- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
- [ ] Low
- [ ] Medium
- [ ] High
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*

View File

@ -160,66 +160,6 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
ubuntu-focal-make:
runs-on: ubuntu-20.04
env:
LLAMA_NODE_AVAILABLE: true
LLAMA_PYTHON_AVAILABLE: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8
- uses: actions/setup-node@v4
with:
node-version: "20"
- uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
run: |
CC=gcc-8 make -j $(nproc)
- name: Test
id: make_test
run: |
CC=gcc-8 make tests -j $(nproc)
make test -j $(nproc)
ubuntu-focal-make-curl:
runs-on: ubuntu-20.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
LLAMA_CURL: 1
run: |
CC=gcc-8 make -j $(nproc)
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@ -517,36 +457,6 @@ jobs:
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc)
# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
macOS-latest-make:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
run: |
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
- name: Test
id: make_test
run: |
GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
@ -642,33 +552,35 @@ jobs:
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
macOS-latest-swift:
runs-on: macos-latest
strategy:
matrix:
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama -destination "${{ matrix.destination }}"
- name: Build Swift Example
id: make_build_swift_example
run: |
make swift
# TODO: temporarily disabled. See the following PR for possibly re-enabling it:
# https://github.com/ggerganov/llama.cpp/pull/10525
# macOS-latest-swift:
# runs-on: macos-latest
#
# strategy:
# matrix:
# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Dependencies
# id: depends
# continue-on-error: true
# run: |
# brew update
#
# - name: xcodebuild for swift package
# id: xcodebuild
# run: |
# xcodebuild -scheme llama -destination "${{ matrix.destination }}"
#
# - name: Build Swift Example
# id: make_build_swift_example
# run: |
# make swift
windows-msys2:
runs-on: windows-latest
@ -695,21 +607,6 @@ jobs:
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas
- name: Build using make
shell: msys2 {0}
run: |
make -j $(nproc)
- name: Clean after building using make
shell: msys2 {0}
run: |
make clean
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make GGML_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
run: |
@ -1121,6 +1018,11 @@ jobs:
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ github.job }}
- name: Build
id: cmake_build
run: |
@ -1252,9 +1154,7 @@ jobs:
runs-on: ubuntu-latest
needs:
- ubuntu-focal-make
- ubuntu-latest-cmake
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-2019-cmake-cuda

View File

@ -96,10 +96,6 @@ if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
endif()
if (NOT DEFINED GGML_AMX)
set(GGML_AMX ON)
endif()
if (NOT DEFINED GGML_CUDA_GRAPHS)
set(GGML_CUDA_GRAPHS_DEFAULT ON)
endif()

3
CODEOWNERS Normal file
View File

@ -0,0 +1,3 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
ci/ @ggerganov

View File

@ -1,9 +1,10 @@
# Pull requests (for contributors)
- Test your changes:
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
@ -12,6 +13,7 @@
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
# Coding guidelines

View File

@ -1,3 +1,7 @@
ifndef LLAMA_MAKEFILE
$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
endif
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
libllava.a \
@ -251,11 +255,11 @@ endif
# Compile flags
#
# keep standard at C11 and C++11
# keep standard at C11 and C++17
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11
MK_CXXFLAGS = -std=c++17 -fPIC
MK_NVCCFLAGS = -std=c++17
ifdef LLAMA_NO_CCACHE
GGML_NO_CCACHE := 1
@ -575,9 +579,12 @@ endif
ifndef GGML_NO_AMX
MK_CPPFLAGS += -DGGML_USE_AMX
OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
endif
# only necessary for the CPU backend files
MK_CPPFLAGS += -Iggml/src/ggml-cpu
ifdef GGML_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJ_GGML_EXT += ggml/src/ggml-rpc.o

View File

@ -28,13 +28,16 @@ var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
.headerSearchPath("ggml/src"),
.headerSearchPath("ggml/src/ggml-cpu"),
// NOTE: NEW_LAPACK will require iOS version 16.4+
// We should consider adding this in the future when we drop support for iOS 14
// (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
.define("GGML_USE_CPU"),
]
#if canImport(Darwin)
sources.append("ggml/src/ggml-common.h")
sources.append("ggml/src/ggml-metal/ggml-metal.m")
@ -44,7 +47,6 @@ cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL"),
.define("GGML_USE_CPU")
]
)
#endif
@ -86,5 +88,5 @@ let package = Package(
linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx11
cxxLanguageStandard: .cxx17
)

483
README.md
View File

@ -4,7 +4,6 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -26,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
variety of hardware - locally and in the cloud.
range of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
@ -36,14 +35,17 @@ variety of hardware - locally and in the cloud.
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
improved significantly thanks to many contributions. It is the main playground for developing new features for the
[ggml](https://github.com/ggerganov/ggml) library.
The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
**Supported models:**
<details>
<summary>Models</summary>
Typically finetunes of the base models below are supported as well.
Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
#### Text-only
- [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙
- [x] LLaMA 3 🦙🦙🦙
@ -97,9 +99,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
**Multimodal models:**
#### Multimodal
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
@ -112,7 +112,10 @@ Typically finetunes of the base models below are supported as well.
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
**Bindings:**
</details>
<details>
<summary>Bindings</summary>
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
@ -139,191 +142,139 @@ Typically finetunes of the base models below are supported as well.
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
**UI:**
</details>
Unless otherwise noted these projects are open-source with permissive licensing:
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
- [ollama/ollama](https://github.com/ollama/ollama)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
<details>
<summary>UIs</summary>
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:**
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
- [LARS](https://github.com/abgulati/LARS) (AGPL)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [MindMac](https://mindmac.app) (proprietary)
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [semperai/amica](https://github.com/semperai/amica) (MIT)
- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
</details>
<details>
<summary>Tools</summary>
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
**Infrastructure:**
</details>
<details>
<summary>Infrastructure</summary>
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
**Games:**
</details>
<details>
<summary>Games</summary>
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
## Demo
<details>
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
```
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
I UNAME_M: arm64
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
make: Nothing to be done for `default'.
main: build = 1041 (cf658ad)
main: seed = 1692823051
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
llama_model_loader: - type f32: 81 tensors
llama_model_loader: - type q4_0: 281 tensors
llama_model_loader: - type q6_K: 1 tensors
llm_load_print_meta: format = GGUF V1 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
llm_load_print_meta: n_layer = 40
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: f_norm_eps = 1.0e-05
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: n_ff = 13824
llm_load_print_meta: freq_base = 10000.0
llm_load_print_meta: freq_scale = 1
llm_load_print_meta: model type = 13B
llm_load_print_meta: model ftype = mostly Q4_0
llm_load_print_meta: model size = 13.02 B
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.11 MB
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
...................................................................................................
llama_new_context_with_model: kv self size = 400.00 MB
llama_new_context_with_model: compute buffer total size = 75.41 MB
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
Building a website can be done in 10 simple steps:
Step 1: Find the right website platform.
Step 2: Choose your domain name and hosting plan.
Step 3: Design your website layout.
Step 4: Write your website content and add images.
Step 5: Install security features to protect your site from hackers or spammers
Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
Step 7: Test it again with people who are not related to you personally friends or family members will work just fine!
Step 8: Start marketing and promoting the website via social media channels or paid ads
Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
How does a Website Work?
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit whether its an image or text file (like PDFs). In order for someone elses browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
How to
llama_print_timings: load time = 576.45 ms
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
llama_print_timings: total time = 25431.49 ms
```
</details>
<details>
<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
## Supported backends
And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
| Backend | Target devices |
| --- | --- |
| [Metal](docs/build.md#metal-build) | Apple Silicon |
| [BLAS](docs/build.md#blas-build) | All |
| [BLIS](docs/backend/BLIS.md) | All |
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
## Building the project
</details>
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
## Usage
- Clone this repository and build locally, see [how to build](docs/build.md)
- On macOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
- Use a Docker image, see [documentation for Docker](docs/docker.md)
- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
Here are the end-to-end binary build and model conversion steps for most supported models.
## Obtaining and quantizing models
### Basic usage
The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
Firstly, you need to get the binary. There are different methods that you can follow:
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can run a basic completion using this command:
After downloading a model, use the CLI tools to run it locally - see below.
`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF metadata in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
To learn more about model quantization, [read this documentation](examples/quantize/README.md)
## [`llama-cli`](examples/main)
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
- <details open>
<summary>Run simple text completion</summary>
```bash
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128
# Output:
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
See [this page](./examples/main/README.md) for a full list of parameters.
</details>
### Conversation mode
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
- <details>
<summary>Run in conversation mode</summary>
```bash
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
llama-cli -m model.gguf -p "You are a helpful assistant" -cnv
# Output:
# > hi, who are you?
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
#
@ -331,124 +282,174 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
# Easy peasy! The answer to 1+1 is... 2!
```
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
</details>
- <details>
<summary>Run with custom chat template</summary>
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
# use the "chatml" template
llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
# use a custom template
llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
```
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
[Supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
</details>
- <details>
<summary>Constrain the output with a custom grammar</summary>
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
# {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
```
### Web server
The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
Example usage:
</details>
## [`llama-server`](examples/server)
#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
- <details open>
<summary>Start a local HTTP server with default configuration on port 8080</summary>
```bash
./llama-server -m your_model.gguf --port 8080
llama-server -m model.gguf --port 8080
# Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
```
### Interactive mode
</details>
> [!NOTE]
> If you prefer basic usage, please consider using conversation mode instead of interactive mode
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command
- <details>
<summary>Support multiple users and parallel decoding</summary>
```bash
# default arguments using a 7B model
./examples/chat.sh
# advanced chat with a 13B model
./examples/chat-13B.sh
# custom arguments using a 13B model
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
# up to 4 concurrent requests, each with 4096 max context
llama-server -m model.gguf -c 16384 -np 4
```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
</details>
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
- <details>
<summary>Enable speculative decoding</summary>
```bash
# Start a new chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Resume that chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Start a different chat with the same prompt/model
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
# Different prompt cache for different prompt/model
PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
# the draft.gguf model should be a small variant of the target model.gguf
llama-server -m model.gguf -md draft.gguf
```
### Constrained output with grammars
</details>
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
- <details>
<summary>Serve an embedding model</summary>
```bash
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
# use the /embedding endpoint
llama-server -m model.gguf --embedding --pooling cls -ub 8192
```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
</details>
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
- <details>
<summary>Serve a reranking model</summary>
## Build
```bash
# use the /reranking endpoint
llama-server -m model.gguf --reranking
```
Please refer to [Build llama.cpp locally](./docs/build.md)
</details>
## Supported backends
- <details>
<summary>Constrain all outputs with a grammar</summary>
| Backend | Target devices |
| --- | --- |
| [Metal](./docs/build.md#metal-build) | Apple Silicon |
| [BLAS](./docs/build.md#blas-build) | All |
| [BLIS](./docs/backend/BLIS.md) | All |
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU |
| [CANN](./docs/build.md#cann) | Ascend NPU |
```bash
# custom grammar
llama-server -m model.gguf --grammar-file grammar.gbnf
## Tools
# JSON
llama-server -m model.gguf --grammar-file grammars/json.gbnf
```
### Prepare and Quantize
</details>
> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
## [`llama-perplexity`](examples/perplexity)
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
- <details open>
<summary>Measure the perplexity over a text file</summary>
### Perplexity (measuring model quality)
```bash
llama-perplexity -m model.gguf -f file.txt
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
# [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
# Final estimate: PPL = 5.4007 +/- 0.67339
```
</details>
- <details>
<summary>Measure KL divergence</summary>
```bash
# TODO
```
</details>
[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](examples/llama-bench)
#### Benchmark the performance of the inference for various parameters.
- <details open>
<summary>Run default benchmark</summary>
```bash
llama-bench -m model.gguf
# Output:
# | model | size | params | backend | threads | test | t/s |
# | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
# | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 ± 20.55 |
# | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 ± 0.81 |
#
# build: 3e0ba0e60 (4229)
```
</details>
## [`llama-simple`](examples/simple)
#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
- <details>
<summary>Basic text completion</summary>
```bash
llama-simple -m model.gguf
# Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
```
</details>
To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
## Contributing
@ -463,20 +464,19 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
## Other documentation
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [GBNF grammars](./grammars/README.md)
- [main (cli)](examples/main/README.md)
- [server](examples/server/README.md)
- [GBNF grammars](grammars/README.md)
**Development documentation**
#### Development documentation
- [How to build](./docs/build.md)
- [Running on Docker](./docs/docker.md)
- [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
- [How to build](docs/build.md)
- [Running on Docker](docs/docker.md)
- [Build on Android](docs/android.md)
- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
**Seminal papers and background on the models**
#### Seminal papers and background on the models
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA:
@ -487,3 +487,6 @@ If your issue is with model generation quality, then please at least scan the fo
- GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
#### References

View File

@ -815,7 +815,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
ln -sfn ${mnt_models} ${SRC}/models-mnt
# Create a fresh python3 venv and enter it
python3 -m venv "$MNT/venv"
if ! python3 -m venv "$MNT/venv"; then
echo "Error: Failed to create Python virtual environment at $MNT/venv."
exit 1
fi
source "$MNT/venv/bin/activate"
pip install -r ${SRC}/requirements.txt --disable-pip-version-check

View File

@ -88,5 +88,5 @@ if (LLAMA_CURL)
endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

View File

@ -348,6 +348,18 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
return true;
}
// Returns the names of the built-in chat templates joined into a single
// string, separated by ", " (no trailing separator). Used to build the help
// text of the --chat-template option.
static std::string list_builtin_chat_templates() {
    // query with a null buffer first; the result is used as the number of entries
    // NOTE(review): presumably this call only reports the template count - confirm against llama.h
    const int32_t n_tmpl = llama_chat_builtin_templates(nullptr, 0);

    // the second call receives a buffer sized from that count to store the name pointers
    std::vector<const char *> names(n_tmpl);
    llama_chat_builtin_templates(names.data(), names.size());

    std::ostringstream joined;
    for (size_t i = 0; i < names.size(); ++i) {
        joined << names[i];
        if (i + 1 != names.size()) {
            joined << ", ";
        }
    }
    return joined.str();
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// load dynamic backends
ggml_backend_load_all();
@ -1814,9 +1826,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
"set custom jinja chat template (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
if (!common_chat_verify_template(value)) {
throw std::runtime_error(string_format(

View File

@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {
std::u32string filename_utf32;
try {
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
filename_utf32 = converter.from_bytes(filename);
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,

View File

@ -133,6 +133,7 @@ struct common_params_sampling {
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
bool timing_per_token = false;
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

View File

@ -27,13 +27,6 @@ We recommend using openmp since it's easier to modify the cores being used.
### llama.cpp compilation
Makefile:
```bash
make GGML_BLIS=1 -j
# make GGML_BLIS=1 llama-benchmark-matmult
```
CMake:
```bash

View File

@ -7,33 +7,11 @@ git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
In order to build llama.cpp you have four different options.
The following sections describe how to build with different backends and options.
- Using `make`:
- On Linux or MacOS:
## CPU Build
```bash
make
```
- On Windows (x86/x64 only, arm64 requires cmake):
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc.
3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder.
5. From here you can run:
```bash
make
```
- Notes:
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`:
Build llama.cpp using `CMake`:
```bash
cmake -B build
@ -42,9 +20,8 @@ In order to build llama.cpp you have four different options.
**Notes**:
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/)
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
@ -60,6 +37,9 @@ In order to build llama.cpp you have four different options.
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
- Tab Workload: Desktop-development with C++
@ -70,61 +50,20 @@ In order to build llama.cpp you have four different options.
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
2. Add your user to **video** group
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
## Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
## BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use:
### Accelerate Framework:
### Accelerate Framework
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
### OpenBLAS:
### OpenBLAS
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
- Using `make`:
- On Linux:
```bash
make GGML_OPENBLAS=1
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
3. Extract `w64devkit` on your pc.
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder.
8. From here you can run:
```bash
make GGML_OPENBLAS=1
```
- Using `CMake` on Linux:
```bash
@ -136,14 +75,6 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i
Check [BLIS.md](./backend/BLIS.md) for more information.
### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
### Intel oneMKL
Building through oneAPI compilers will make the avx_vnni instruction set available for Intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
@ -161,16 +92,29 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
### CUDA
### Other BLAS libraries
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors.
For Jetson users: if you have a Jetson Orin, you can try the [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older model (Nano/TX2), some additional steps are required before compiling.
## Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument.
## SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
## CUDA
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
- Using `make`:
```bash
make GGML_CUDA=1
```
- Using `CMake`:
```bash
@ -192,14 +136,10 @@ The following compilation options are also available to tweak performance:
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
### MUSA
## MUSA
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
- Using `make`:
```bash
make GGML_MUSA=1
```
- Using `CMake`:
```bash
@ -213,16 +153,12 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
### hipBLAS
## HIP
This provides BLAS acceleration on HIP-supported AMD GPUs.
This provides GPU acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
make GGML_HIP=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
@ -247,11 +183,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
&& cmake --build build -- -j 16
```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
@ -265,11 +196,11 @@ You can download it from your Linux distro's package manager or from here: [ROCm
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to the version of a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
### Vulkan
## Vulkan
**Windows**
#### w64devkit
### w64devkit
Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
@ -289,9 +220,14 @@ Libs: -lvulkan-1
EOF
```
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
#### Git Bash MINGW64
Switch into the `llama.cpp` directory and build using CMake.
```sh
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
### Git Bash MINGW64
Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
@ -310,11 +246,11 @@ cmake --build build --config Release
Now you can load the model in conversation mode using `Vulkan`
```
build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
```sh
build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
```
#### MSYS2
### MSYS2
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
```sh
pacman -S git \
@ -323,7 +259,8 @@ Install [MSYS2](https://www.msys2.org/) and then run the following commands in a
mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc
```
Switch into `llama.cpp` directory and build using CMake.
Switch into the `llama.cpp` directory and build using CMake.
```sh
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
@ -372,7 +309,7 @@ cmake --build build --config Release
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### CANN
## CANN
This provides NPU acceleration using the AI cores of your Ascend NPU. [CANN](https://www.hiascend.com/en/software/cann) is a set of hierarchical APIs that helps you quickly build AI applications and services based on Ascend NPU.
For more information about Ascend NPU, see the [Ascend Community](https://www.hiascend.com/en/).
@ -387,22 +324,26 @@ cmake --build build --config release
You can test with:
`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
```bash
llm_load_tensors: CANN buffer size = 13313.00 MiB
./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32
```
If the following info is output on screen, you are using `llama.cpp` with the CANN backend:
```bash
llm_load_tensors: CANN model buffer size = 13313.00 MiB
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
```
For detailed info, such as supported models/devices and CANN installation, please refer to [llama.cpp for CANN](./backend/CANN.md).
### Android
## Android
To read documentation for how to build on Android, [click here](./android.md)
### Arm CPU optimized mulmat kernels
## Notes about GPU-accelerated backends
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option.
Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building.

View File

@ -1,61 +0,0 @@
#!/bin/bash
#
# Few-shot translation example.
# Requires a base model (i.e. no fine-tuned or instruct models).
#
# Usage:
#
# cd llama.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
#
# Arguments:
#   $1  path to the base model
#   $2  text to translate (appended to the few-shot prompt)
#   $3+ optional extra arguments forwarded verbatim to llama-cli
#
if [ $# -lt 2 ]; then
    echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
    exit 1
fi
# keep the extra arguments in an array so that arguments containing spaces
# are forwarded as single words instead of being re-split
eargs=()
if [ $# -gt 2 ]; then
    eargs=("${@:3}")
fi
ftmp="__llama.cpp_example_tmp__.txt"
# remove the temporary prompt file on exit; single quotes defer the expansion
# to the time the trap runs and the inner quotes keep the path as one word
trap 'rm -f "$ftmp"' EXIT
# few-shot examples that teach the "source => translation" format
echo "Translate from English to French:
===
sea otter, peppermint, plush girafe:
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
===
violin
violin => violon
===
phone, computer, mouse, keyboard:
phone => téléphone
computer => ordinateur
mouse => souris
keyboard => clavier
===
" > "$ftmp"
# append the user-supplied text as the final, unanswered example
echo "$2
" >> "$ftmp"
model=$1
# generate the most likely continuation until the string "===" is found
./llama-cli -m "$model" -f "$ftmp" -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" "${eargs[@]}"

View File

@ -2,4 +2,4 @@ set(TARGET llama-batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,11 +2,8 @@
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository.
`$ make -j`
After successful compilation, following usage options are available:
```
usage: ./llama-convert-llama2c-to-ggml [options]

View File

@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator)
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,7 +2,7 @@ set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET}

View File

@ -2,4 +2,4 @@ set(TARGET llama-export-lora)
add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gen-docs)
add_executable(${TARGET} gen-docs.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -19,4 +19,4 @@ add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gguf-split)
add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gritlm)
add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-imatrix)
add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -25,8 +25,6 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
## Example
```bash
GGML_CUDA=1 make -j
# generate importance matrix (imatrix.dat)
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99

View File

@ -637,9 +637,18 @@ int main(int argc, char ** argv) {
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}
if (params.prompt.empty()) {
if (params.in_files.empty()) {
LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
return 1;
}
LOG_INF("No prompt provided; combining precomputed matrices only.\n");
} else {
if (!compute_imatrix(ctx, params)) {
return 1;
}
}
g_collector.save_imatrix();

View File

@ -2,4 +2,4 @@ set(TARGET llama-infill)
add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-bench)
add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .)
target_include_directories(llava PUBLIC ../..)
target_include_directories(llava PUBLIC ../../common)
target_compile_features(llava PRIVATE cxx_std_11)
target_compile_features(llava PRIVATE cxx_std_17)
add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
if (BUILD_SHARED_LIBS)
@ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-minicpmv-cli)
add_executable(${TARGET} minicpmv-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-lookahead)
add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,22 +2,22 @@ set(TARGET llama-lookup)
add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-cli)
add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-parallel)
add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-passkey)
add_executable(${TARGET} passkey.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-perplexity)
add_executable(${TARGET} perplexity.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-retrieval)
add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-save-load-state)
add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -50,4 +50,4 @@ if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -69,6 +69,8 @@ The project is under active development, and we are [looking for feedback and co
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
@ -158,9 +160,16 @@ The project is under active development, and we are [looking for feedback and co
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
Note: If both a command-line argument and an environment variable are set for the same param, the argument will take precedence over the env var.
@ -188,12 +197,6 @@ services:
`llama-server` is built alongside everything else from the root of the project
- Using `make`:
```bash
make llama-server
```
- Using `CMake`:
```bash
@ -207,15 +210,6 @@ services:
`llama-server` can also be built with SSL support using OpenSSL 3
- Using `make`:
```bash
# NOTE: For non-system openssl, use the following:
# CXXFLAGS="-I /path/to/openssl/include"
# LDFLAGS="-L /path/to/openssl/lib"
make LLAMA_SERVER_SSL=true llama-server
```
- Using `CMake`:
```bash
@ -416,6 +410,8 @@ node index.js
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
**Response format**
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.

View File

@ -177,6 +177,8 @@ struct server_slot {
bool stopped_word = false;
bool stopped_limit = false;
bool timings_per_token = false;
bool oaicompat = false;
std::string oaicompat_model;
@ -882,6 +884,8 @@ struct server_context {
slot.oaicompat_model = "";
}
slot.timings_per_token = json_value(data, "timings_per_token", false);
slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
@ -1279,6 +1283,7 @@ struct server_context {
{"speculative.n_max", slot.params.speculative.n_max},
{"speculative.n_min", slot.params.speculative.n_min},
{"speculative.p_min", slot.params.speculative.p_min},
{"timings_per_token", slot.timings_per_token},
};
}
@ -1336,6 +1341,10 @@ struct server_context {
res.data["model"] = slot.oaicompat_model;
}
if (slot.timings_per_token) {
res.data["timings"] = slot.get_formated_timings();
}
queue_results.send(res);
}
@ -2274,12 +2283,17 @@ struct server_context {
common_sampler_accept(slot.smpl, id, true);
slot.n_decoded += 1;
const int64_t t_current = ggml_time_us();
if (slot.n_decoded == 1) {
slot.t_start_generation = ggml_time_us();
slot.t_start_generation = t_current;
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
metrics.on_prompt_eval(slot);
}
slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
completion_token_output result;
result.tok = id;
@ -3347,8 +3361,18 @@ int main(int argc, char ** argv) {
llama_backend_free();
};
// bind HTTP listen port, run the HTTP server in a thread
if (!svr->bind_to_port(params.hostname, params.port)) {
// bind HTTP listen port
bool was_bound = false;
if (params.port == 0) {
int bound_port = svr->bind_to_any_port(params.hostname);
if ((was_bound = (bound_port >= 0))) {
params.port = bound_port;
}
} else {
was_bound = svr->bind_to_port(params.hostname, params.port);
}
if (!was_bound) {
//LOG_ERROR("couldn't bind HTTP server socket", {
// {"hostname", params.hostname},
// {"port", params.port},
@ -3357,6 +3381,8 @@ int main(int argc, char ** argv) {
clean_up();
return 1;
}
// run the HTTP server in a thread
std::thread t([&]() { svr->listen_after_bind(); });
svr->wait_until_ready();

View File

@ -32,3 +32,17 @@ def test_server_models():
assert res.status_code == 200
assert len(res.body["data"]) == 1
assert res.body["data"][0]["id"] == server.model_alias
def test_load_split_model():
    """Start the server on a multi-part GGUF from Hugging Face and request a short completion."""
    global server
    # point the server at part 1 of 3 of the split model
    server.model_hf_repo = "ggml-org/models"
    server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf"
    server.model_alias = "tinyllama-split"
    server.start()
    payload = {
        "n_predict": 16,
        "prompt": "Hello",
        "temperature": 0.0,
    }
    response = server.make_request("POST", "/completion", data=payload)
    assert response.status_code == 200
    assert match_regex("(little|girl)+", response.body["content"])

View File

@ -127,3 +127,39 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
assert res.status_code != 200
assert "error" in res.body
@pytest.mark.parametrize("messages", [
    None,
    "string",
    [123],
    [{}],
    [{"role": 123}],
    [{"role": "system", "content": 123}],
    # [{"content": "hello"}], # TODO: should not be a valid case
    [{"role": "system", "content": "test"}, {}],
])
def test_invalid_chat_completion_req(messages):
    """Each malformed `messages` value must be answered with HTTP 400 or 500 and an error body."""
    global server
    server.start()
    payload = {"messages": messages}
    response = server.make_request("POST", "/chat/completions", data=payload)
    assert response.status_code in (400, 500)
    assert "error" in response.body
def test_chat_completion_with_timings_per_token():
    """With `timings_per_token` enabled, every streamed chunk must carry a `timings` object."""
    global server
    server.start()
    request_body = {
        "max_tokens": 10,
        "messages": [{"role": "user", "content": "test"}],
        "stream": True,
        "timings_per_token": True,
    }
    stream = server.make_stream_request("POST", "/chat/completions", data=request_body)
    for chunk in stream:
        assert "timings" in chunk
        timings = chunk["timings"]
        assert "prompt_per_second" in timings
        assert "predicted_per_second" in timings
        assert "predicted_n" in timings
        # never more predicted tokens than the requested max_tokens
        assert timings["predicted_n"] <= 10

View File

@ -8,6 +8,7 @@ def create_server():
global server
server = ServerPreset.tinyllama_infill()
def test_infill_without_input_extra():
global server
server.start()
@ -19,6 +20,7 @@ def test_infill_without_input_extra():
assert res.status_code == 200
assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
def test_infill_with_input_extra():
global server
server.start()
@ -33,3 +35,23 @@ def test_infill_with_input_extra():
})
assert res.status_code == 200
assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
@pytest.mark.parametrize("input_extra", [
    {},
    {"filename": "ok"},
    {"filename": 123},
    {"filename": 123, "text": "abc"},
    {"filename": 123, "text": 456},
])
def test_invalid_input_extra_req(input_extra):
    """Each malformed `input_extra` entry sent to /infill must be rejected with HTTP 400."""
    global server
    server.start()
    payload = {
        "prompt": "Complete this",
        "input_extra": [input_extra],
        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
        "input_suffix": "}\n",
    }
    response = server.make_request("POST", "/infill", data=payload)
    assert response.status_code == 400
    assert "error" in response.body

View File

@ -36,3 +36,20 @@ def test_rerank():
assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
assert most_relevant["index"] == 2
assert least_relevant["index"] == 3
@pytest.mark.parametrize("documents", [
    [],
    None,
    123,
    [1, 2, 3],
])
def test_invalid_rerank_req(documents):
    """Each malformed `documents` value sent to /rerank must be rejected with HTTP 400."""
    global server
    server.start()
    payload = {
        "query": "Machine learning is",
        "documents": documents,
    }
    response = server.make_request("POST", "/rerank", data=payload)
    assert response.status_code == 400
    assert "error" in response.body

View File

@ -0,0 +1,103 @@
import pytest
from utils import *
# We use a F16 MOE gguf as main model, and q4_0 as draft model
server = ServerPreset.stories15m_moe()
MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf"
def create_server():
    """Reset the module-level `server` to the stories15m MoE preset with a draft model attached.

    The draft model file is downloaded from MODEL_DRAFT_FILE_URL on first use
    and reused on later calls if it already exists on disk.

    Raises:
        requests.HTTPError: if the draft model download returns an error status.
    """
    global server
    server = ServerPreset.stories15m_moe()
    # download draft model file if needed
    file_name = MODEL_DRAFT_FILE_URL.split('/').pop()
    model_draft_file = f'../../../{file_name}'
    if not os.path.exists(model_draft_file):
        print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}")
        # fetch and validate BEFORE creating the file: a failed download must not
        # leave an empty file or an HTTP error body behind, because the existence
        # check above would treat it as a valid cached model on every later run
        response = requests.get(MODEL_DRAFT_FILE_URL)
        response.raise_for_status()
        with open(model_draft_file, 'wb') as f:
            f.write(response.content)
        print("Done downloading draft model file")
    # set default values
    server.model_draft = model_draft_file
    server.draft_min = 4
    server.draft_max = 8
# Module-scoped, auto-used fixture: calls create_server() once per test module
# so the tests start from the draft-model configuration it sets up.
@pytest.fixture(scope="module", autouse=True)
def fixture_create_server():
return create_server()
def test_with_and_without_draft():
    """The same request must produce identical text with and without a draft model."""
    global server
    request_body = {
        "prompt": "I believe the meaning of life is",
        "temperature": 0.0,
        "top_k": 1,
    }

    # first run: draft model disabled
    server.model_draft = None
    server.start()
    baseline = server.make_request("POST", "/completion", data=dict(request_body))
    assert baseline.status_code == 200
    content_no_draft = baseline.body["content"]
    server.stop()

    # second run: fresh server with the draft model configured by create_server()
    create_server()
    server.start()
    drafted = server.make_request("POST", "/completion", data=dict(request_body))
    assert drafted.status_code == 200
    content_draft = drafted.body["content"]

    assert content_no_draft == content_draft
def test_different_draft_min_draft_max():
    """Changing draft_min / draft_max must not change the generated text."""
    global server
    draft_ranges = (
        (1, 2),
        (1, 4),
        (4, 8),
        (4, 12),
        (8, 16),
    )
    previous_content = None
    for lower, upper in draft_ranges:
        # restart the server with the new draft window
        server.stop()
        server.draft_min = lower
        server.draft_max = upper
        server.start()
        response = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "temperature": 0.0,
            "top_k": 1,
        })
        assert response.status_code == 200
        current_content = response.body["content"]
        # every configuration after the first is compared with its predecessor
        if previous_content is not None:
            assert previous_content == current_content
        previous_content = current_content
@pytest.mark.parametrize("n_slots,n_requests", [
    (1, 2),
    (2, 2),
])
def test_multi_requests_parallel(n_slots: int, n_requests: int):
    """Issue `n_requests` identical completions through parallel_function_calls and check each result."""
    global server
    server.n_slots = n_slots
    server.start()
    # one (callable, args) pair per request, each with its own payload dict
    tasks = [
        (server.make_request, ("POST", "/completion", {
            "prompt": "I believe the meaning of life is",
            "temperature": 0.0,
            "top_k": 1,
        }))
        for _ in range(n_requests)
    ]
    for response in parallel_function_calls(tasks):
        assert response.status_code == 200
        assert match_regex("(wise|kind|owl|answer)+", response.body["content"])

View File

@ -46,6 +46,7 @@ class ServerProcess:
model_alias: str | None = None
model_url: str | None = None
model_file: str | None = None
model_draft: str | None = None
n_threads: int | None = None
n_gpu_layer: int | None = None
n_batch: int | None = None
@ -68,6 +69,8 @@ class ServerProcess:
response_format: str | None = None
lora_files: List[str] | None = None
disable_ctx_shift: int | None = False
draft_min: int | None = None
draft_max: int | None = None
# session variables
process: subprocess.Popen | None = None
@ -102,6 +105,8 @@ class ServerProcess:
server_args.extend(["--model", self.model_file])
if self.model_url:
server_args.extend(["--model-url", self.model_url])
if self.model_draft:
server_args.extend(["--model-draft", self.model_draft])
if self.model_hf_repo:
server_args.extend(["--hf-repo", self.model_hf_repo])
if self.model_hf_file:
@ -147,6 +152,10 @@ class ServerProcess:
server_args.extend(["--no-context-shift"])
if self.api_key:
server_args.extend(["--api-key", self.api_key])
if self.draft_max:
server_args.extend(["--draft-max", self.draft_max])
if self.draft_min:
server_args.extend(["--draft-min", self.draft_min])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
@ -185,6 +194,7 @@ class ServerProcess:
raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
def stop(self) -> None:
if self in server_instances:
server_instances.remove(self)
if self.process:
print(f"Stopping server with pid={self.process.pid}")

View File

@ -650,6 +650,10 @@ static json format_final_response_oaicompat(const json & request, const json & r
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
if (result.contains("timings")) {
res.push_back({"timings", json_value(result, "timings", json::object())});
}
return res;
}
@ -740,6 +744,11 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
{"model", modelname},
{"object", "chat.completion.chunk"}
};
if (result.contains("timings")) {
ret.push_back({"timings", json_value(result, "timings", json::object())});
}
if (!finish_reason.empty()) {
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);

View File

@ -2,4 +2,4 @@ set(TARGET llama-simple-chat)
add_executable(${TARGET} simple-chat.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-speculative-simple)
add_executable(${TARGET} speculative-simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-tokenize)
add_executable(${TARGET} tokenize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -96,6 +96,7 @@ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
@ -161,7 +162,6 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING

View File

@ -1,25 +0,0 @@
#pragma once
// Public C interface of the ggml AMX backend.
// Declares one buffer-type accessor and the backend entry points below; all
// symbols are exported through GGML_BACKEND_API and have C linkage when this
// header is included from C++.
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// buffer_type API
// Returns the buffer type handle of the AMX backend.
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
// NOTE(review): presumably reports whether `backend` is an AMX backend instance - confirm against the implementation.
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
// backend API
// NOTE(review): presumably creates a new AMX backend instance; failure behaviour is not visible here.
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
// Sets the thread count on the given AMX backend (valid range not visible in this header).
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
// Returns the backend registry entry for the AMX backend.
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
#ifdef __cplusplus
}
#endif

View File

@ -261,21 +261,15 @@ function(ggml_add_backend backend)
if (${backend_id})
string(TOLOWER "ggml-${backend}" backend_target)
add_subdirectory(${backend_target})
# check again in case the backend disabled itself
# note that this should NOT be the normal behavior, in case of errors the backend should fail the build
# however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
if (${backend_id})
message(STATUS "Including ${backend} backend")
if (NOT GGML_BACKEND_DL)
string(TOUPPER "GGML_USE_${backend}" backend_use)
target_compile_definitions(ggml PUBLIC ${backend_use})
endif()
endif()
endif()
endfunction()
ggml_add_backend(CPU)
ggml_add_backend(AMX)
ggml_add_backend(BLAS)
ggml_add_backend(CANN)
ggml_add_backend(CUDA)
@ -289,7 +283,7 @@ ggml_add_backend(Vulkan)
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
target_compile_features (${target} PRIVATE c_std_11) # don't bump
target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
endforeach()
target_link_libraries(ggml-base PRIVATE Threads::Threads)

View File

@ -1,105 +0,0 @@
# Build script for the stand-alone AMX backend library (ggml-amx).
# The backend is only built for x86 targets compiled with GCC newer than 11.0;
# otherwise GGML_AMX is switched off in the parent scope and a warning is printed.
# NOTE(review): the condition mixes OR and AND without grouping the three architecture
# tests together - confirm against the CMake if() precedence rules that it evaluates as
# (x86 target) AND (GNU compiler) AND (version > 11.0).
if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
message(STATUS "Using AMX")
# collect every header/source in this directory plus the public header
file(GLOB GGML_HEADERS_AMX "*.h")
list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
file(GLOB GGML_SOURCES_AMX "*.cpp")
ggml_add_backend_library(ggml-amx
${GGML_HEADERS_AMX}
${GGML_SOURCES_AMX}
)
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
# TODO: integrate AMX backend into the CPU backend
# NOTE(review): the enclosing condition requires CMAKE_COMPILER_IS_GNUCC, so this MSVC
# branch looks unreachable - confirm before relying on it.
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(../ggml-cpu/cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
# Note: add_compile_definitions applies to the current directory scope, not only to ggml-amx.
if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
# GCC-style flags: each enabled GGML_* option appends its matching -m flag
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
# the collected architecture flags only affect the ggml-amx target
target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
else()
# unsupported toolchain/target: disable the backend for the caller instead of failing the build
set(GGML_AMX OFF PARENT_SCOPE)
message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
endif()

View File

@ -1,449 +0,0 @@
#include "ggml-amx.h"
#include "ggml-amx/common.h"
#include "ggml-amx/mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <memory>
#if defined(__AMX_INT8__)
// AMX buffer interface
// Buffer callback: releases the memory block stored in the buffer context.
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    void * storage = buffer->context;
    free(storage);
}
// Buffer callback: the base address of the buffer is the context pointer itself.
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    void * base = buffer->context;
    return base;
}
// Buffer callback: fills `size` bytes of the tensor data, starting at `offset`, with `value`.
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *)tensor->data + offset;
    memset(dst, value, size);
}
// Buffer callback: uploads `size` bytes of `data` into the tensor.
// Types that have AMX kernels go through the weight conversion routine;
// every other type is copied verbatim at the requested offset.
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    if (!qtype_has_amx_kernels(tensor->type)) {
        memcpy((char *)tensor->data + offset, data, size);
        return;
    }

    ggml_backend_amx_convert_weight(tensor, data, offset, size);
}
// Buffer callback: copies `size` bytes of tensor data, starting at `offset`, into `data`.
// Asserts that the tensor type has no AMX kernels (such tensors are stored converted by set_tensor).
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));

    const char * src = (const char *)tensor->data + offset;
    memcpy(data, src, size);
}
// Buffer callback: copies `src` into `dst`.
// Only sources living in a host buffer are handled; returns false otherwise.
// Sources whose type has AMX kernels are converted on the way in, the rest is a plain byte copy.
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_UNUSED(buffer);

    if (!ggml_backend_buffer_is_host(src->buffer)) {
        return false;
    }

    if (qtype_has_amx_kernels(src->type)) {
        const size_t converted_size = ggml_backend_amx_get_alloc_size(dst);
        ggml_backend_amx_convert_weight(dst, src->data, 0, converted_size);
    } else {
        memcpy(dst->data, src->data, ggml_nbytes(src));
    }

    return true;
}
// Buffer callback: sets every byte of the buffer (buffer->size bytes) to `value`.
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    void * base = buffer->context;
    const size_t n_bytes = buffer->size;
    memset(base, value, n_bytes);
}
// Callback table for AMX buffers; passed to ggml_backend_buffer_init when a buffer is allocated.
// Entries are positional - keep them in the order of the field-name comments.
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
};
// Buffer-type callback: name of the AMX buffer type.
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return "AMX";
}
// Buffer-type callback: allocates a TENSOR_ALIGNMENT-aligned block of at least `size` bytes
// and wraps it in a backend buffer that reports `size` as its size.
// Returns NULL (after printing a message to stderr) when the allocation fails.
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    const size_t alignment = TENSOR_ALIGNMENT;

    // aligned_alloc is only specified for sizes that are an integral multiple of the
    // alignment, so pad the request; guard the padding against size_t wrap-around
    void * data = NULL;
    if (size <= (size_t)-1 - (alignment - 1)) {
        const size_t padded_size = (size + alignment - 1) / alignment * alignment;
        data = aligned_alloc(alignment, padded_size);
    }

    if (data == NULL) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    // the buffer keeps the caller-requested size; the padding is an allocation detail
    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
// Buffer-type callback: AMX buffers use the alignment given by TENSOR_ALIGNMENT.
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return TENSOR_ALIGNMENT;
}
// Buffer-type callback: number of bytes to reserve for `tensor`,
// as computed by ggml_backend_amx_get_alloc_size.
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
    GGML_UNUSED(buft);

    const size_t n_bytes = ggml_backend_amx_get_alloc_size(tensor);
    return n_bytes;
}
// Buffer-type callback: AMX buffers are never reported as host buffers.
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return false;
}
// Returns the AMX buffer type.
// The descriptor is a function-local static, so every call returns the same pointer;
// its device is device 0 of the AMX backend registry.
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ NULL,
};
return &ggml_backend_buffer_type_amx;
}
// backend interface
// Backend callback: name of the AMX backend.
static const char * ggml_backend_amx_name(ggml_backend_t backend) {
    GGML_UNUSED(backend);
    return "AMX";
}
// Backend callback: destroys the backend context first, then the backend object itself.
static void ggml_backend_amx_free(ggml_backend_t backend) {
    delete (ggml_backend_amx_context *)backend->context;
    delete backend;
}
// Backend callback: walks the graph nodes in order.
// MUL_MAT nodes are dispatched to the AMX matmul, NONE/RESHAPE/VIEW/PERMUTE/TRANSPOSE
// nodes need no work, and any other op prints a message and aborts via GGML_ASSERT.
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_backend_amx_context * amx_ctx = (ggml_backend_amx_context *)backend->context;

    for (int idx = 0; idx < cgraph->n_nodes; ++idx) {
        struct ggml_tensor * cur = cgraph->nodes[idx];
        const enum ggml_op cur_op = cur->op;

        if (cur_op == GGML_OP_MUL_MAT) {
            ggml_backend_amx_mul_mat(amx_ctx, cur);
            continue;
        }

        const bool nothing_to_do =
            cur_op == GGML_OP_NONE    ||
            cur_op == GGML_OP_RESHAPE ||
            cur_op == GGML_OP_VIEW    ||
            cur_op == GGML_OP_PERMUTE ||
            cur_op == GGML_OP_TRANSPOSE;

        if (!nothing_to_do) {
            fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(cur));
            GGML_ASSERT(false);
        }
    }

    return GGML_STATUS_SUCCESS;
}
// Callback table of the AMX backend: only name, free and synchronous graph compute are
// provided, every other entry is NULL. Entries are positional.
static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
// Returns the address of the fixed GUID that identifies AMX backend instances.
static ggml_guid_t ggml_backend_amx_guid() {
    static ggml_guid amx_guid = {
        0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67,
        0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e,
    };
    return &amx_guid;
}
// arch_prctl sub-commands and xstate feature numbers used to request AMX tile access
// (values presumably mirror the Linux kernel definitions - confirm against the kernel headers)
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18

// Prepares the process for executing AMX instructions.
// On GNU/Linux this asks the kernel for permission to use the tile data state and
// returns false (after printing a message to stderr) when the request is refused.
// On Windows no request is made and true is returned.
// On any other platform false is returned.
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        fprintf(stderr, "AMX is not ready to be used!\n");
        return false;
    }
    return true;
#elif defined(_WIN32)
    return true;
#else
    // without this branch control would flow off the end of a non-void function,
    // which is undefined behavior
    return false;
#endif
}
// Creates a new AMX backend instance.
// Returns nullptr when ggml_amx_init() reports that AMX cannot be used, so callers never
// receive a backend whose kernels would run without the required permission.
// The returned backend owns a freshly allocated ggml_backend_amx_context.
ggml_backend_t ggml_backend_amx_init() {
    // invoke a Linux system call to request access to AMX features
    if (!ggml_amx_init()) {
        return nullptr;
    }

    // backend context
    ggml_backend_amx_context * ctx = new ggml_backend_amx_context;

    // ggml amx backend
    ggml_backend_t backend = new ggml_backend {
        /* .guid      = */ ggml_backend_amx_guid(),
        /* .interface = */ ggml_backend_amx_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
        /* .context   = */ ctx,
    };

    return backend;
}
// Returns true when `backend` is non-NULL and carries the AMX backend GUID.
bool ggml_backend_is_amx(ggml_backend_t backend) {
    if (backend == NULL) {
        return false;
    }
    return ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
}
// Stores `n_threads` in the context of an AMX backend.
// Asserts that `backend_amx` really is an AMX backend.
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
    GGML_ASSERT(ggml_backend_is_amx(backend_amx));

    auto * amx_ctx = (ggml_backend_amx_context *)backend_amx->context;
    amx_ctx->n_threads = n_threads;
}
// device interface
// Device callback: short name of the AMX device.
static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);
    return "AMX";
}
// Device callback: human-readable description of the AMX device.
static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);
    return "Intel Advanced Matrix Extensions";
}
// Device callback: memory statistics are not implemented yet, both values are reported as 0.
static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    GGML_UNUSED(dev);

    // TODO
    *free  = 0;
    *total = 0;
}
// Device callback: the AMX device is reported with the ACCEL device type.
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);
    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
}
// Device callback: fills `props` from the individual device getters.
// All capability flags are reported as false.
static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_amx_device_get_name(dev);
    props->description = ggml_backend_amx_device_get_description(dev);
    props->type        = ggml_backend_amx_device_get_type(dev);

    size_t * free_out  = &props->memory_free;
    size_t * total_out = &props->memory_total;
    ggml_backend_amx_device_get_memory(dev, free_out, total_out);

    // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
    props->caps = {
        /* .async                 = */ false,
        /* .host_buffer           = */ false,
        /* .buffer_from_host_ptr  = */ false,
        /* .events                = */ false,
    };
}
// Device callback: creates an AMX backend; `dev` and `params` are ignored.
static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(dev);
    GGML_UNUSED(params);

    ggml_backend_t created = ggml_backend_amx_init();
    return created;
}
// Device callback: the device allocates from the AMX buffer type.
static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    ggml_backend_buffer_type_t buft = ggml_backend_amx_buffer_type();
    return buft;
}
// Device callback: reports whether the AMX backend can execute `op`.
// NONE/RESHAPE/VIEW/PERMUTE/TRANSPOSE are always accepted.
// MUL_MAT is accepted only when both sources are contiguous 2d tensors, src1 is F32,
// src0 has a type with AMX kernels (or is F16) and the first output dimension is a
// multiple of TILE_N * 2. Everything else is rejected.
static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    GGML_UNUSED(dev);

    const enum ggml_op kind = op->op;

    if (kind == GGML_OP_NONE    ||
        kind == GGML_OP_RESHAPE ||
        kind == GGML_OP_VIEW    ||
        kind == GGML_OP_PERMUTE ||
        kind == GGML_OP_TRANSPOSE) {
        return true;
    }

    if (kind != GGML_OP_MUL_MAT) {
        return false;
    }

    // handle only 2d gemm for now
    auto contiguous_2d = [](const struct ggml_tensor * t) {
        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
    };

    const struct ggml_tensor * lhs = op->src[0];
    const struct ggml_tensor * rhs = op->src[1];

    // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
    // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
    const enum ggml_type lhs_type = lhs->type;
    const bool kernels_available = qtype_has_amx_kernels(lhs_type) || (lhs_type == GGML_TYPE_F16);

    if (!contiguous_2d(lhs) || !contiguous_2d(rhs)) {
        return false; // both sources must be contiguous
    }
    if (rhs->type != GGML_TYPE_F32) {
        return false; // src1 must be float32
    }
    if (!kernels_available) {
        return false; // no amx kernel impl for this type
    }

    // out_features is 32x
    const int64_t out_features = op->ne[0];
    return out_features % (TILE_N * 2) == 0;
}
// Device callback: a buffer type is supported when its get_name callback is the AMX one.
static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(dev);

    const bool is_amx_buft = buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
    return is_amx_buft;
}
// Callback table of the AMX device. Host buffers, host-pointer buffers, op offloading
// and events are not provided (NULL). Entries are positional.
static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
/* .get_name = */ ggml_backend_amx_device_get_name,
/* .get_description = */ ggml_backend_amx_device_get_description,
/* .get_memory = */ ggml_backend_amx_device_get_memory,
/* .get_type = */ ggml_backend_amx_device_get_type,
/* .get_props = */ ggml_backend_amx_device_get_props,
/* .init_backend = */ ggml_backend_amx_device_init,
/* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_amx_device_supports_op,
/* .supports_buft = */ ggml_backend_amx_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// backend reg interface
// Registry callback: name of the AMX backend registry.
static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);
    return "AMX";
}
// Registry callback: the AMX registry always exposes exactly one device.
static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);
    return 1;
}
// Registry callback: returns the single AMX device; asserts that `index` is 0.
// The device object is a function-local static, so it is initialized with the `reg`
// passed on the first call and the same pointer is returned afterwards.
static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_device amx_device = {
        /* .iface   = */ ggml_backend_amx_device_i,
        /* .reg     = */ reg,
        /* .context = */ nullptr,
    };

    ggml_backend_dev_t dev = &amx_device;
    return dev;
}
// Registry callback: resolves optional entry points by name.
// Only "ggml_backend_set_n_threads" is known; any other name yields NULL.
static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);

    if (std::strcmp(name, "ggml_backend_set_n_threads") != 0) {
        return NULL;
    }

    return (void *)ggml_backend_amx_set_n_threads;
}
// Callback table of the AMX backend registry. Entries are positional.
static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
/* .get_name = */ ggml_backend_amx_reg_get_name,
/* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
/* .get_device = */ ggml_backend_amx_reg_get_device,
/* .get_proc_address = */ ggml_backend_amx_get_proc_address,
};
// Returns the AMX backend registry.
// The registry object is a function-local static, so every call returns the same pointer.
ggml_backend_reg_t ggml_backend_amx_reg(void) {
static struct ggml_backend_reg ggml_backend_amx_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_amx_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_amx_reg;
}
#else // if defined(__AMX_INT8__)
// Fallback used when __AMX_INT8__ is not defined: there is no AMX buffer type.
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
    ggml_backend_buffer_type_t no_buft = nullptr;
    return no_buft;
}
// Fallback used when __AMX_INT8__ is not defined: no backend can be an AMX backend.
bool ggml_backend_is_amx(ggml_backend_t backend) {
    (void) backend;
    return false;
}
// Fallback used when __AMX_INT8__ is not defined: reports the missing support on stderr
// and returns no backend.
ggml_backend_t ggml_backend_amx_init(void) {
    fprintf(stderr, "GGML is not compiled with AMX support!\n");

    ggml_backend_t no_backend = nullptr;
    return no_backend;
}
// Fallback used when __AMX_INT8__ is not defined: only reports the missing support on stderr.
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
    GGML_UNUSED(backend_amx);
    GGML_UNUSED(n_threads);

    fprintf(stderr, "GGML is not compiled with AMX support!\n");
}
// Fallback used when __AMX_INT8__ is not defined: there is no AMX registry.
ggml_backend_reg_t ggml_backend_amx_reg(void) {
    ggml_backend_reg_t no_reg = nullptr;
    return no_reg;
}
#endif
// Dynamic-loading entry point: when GGML_BACKEND_DL is defined this expands to an exported
// ggml_backend_init() that returns ggml_backend_amx_reg(); otherwise it expands to nothing.
GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)

View File

@ -211,7 +211,12 @@ extern "C" {
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Add backend dynamic loading support to the backend
// Initialize the backend
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
// Optional: obtain a score for the backend based on the system configuration
// Higher scores are preferred, 0 means the backend is not supported in the current system
typedef int (*ggml_backend_score_t)(void);
#ifdef GGML_BACKEND_DL
# ifdef __cplusplus
@ -222,15 +227,28 @@ extern "C" {
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
// C++ variant: declares ggml_backend_score() with C linkage and defines it to forward to score_fn().
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
extern "C" { \
GGML_BACKEND_API int ggml_backend_score(void); \
} \
int ggml_backend_score(void) { \
return score_fn(); \
}
# else
// C variant: declares and defines ggml_backend_init(), which forwards to reg_fn().
# define GGML_BACKEND_DL_IMPL(reg_fn) \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
// C variant: declares and defines ggml_backend_score(), which forwards to score_fn().
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
GGML_BACKEND_API int ggml_backend_score(void); \
int ggml_backend_score(void) { \
return score_fn(); \
}
# endif
#else
# define GGML_BACKEND_DL_IMPL(reg_fn)
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
#endif
#ifdef __cplusplus

View File

@ -2,8 +2,13 @@
#include "ggml-backend.h"
#include "ggml-impl.h"
#include <algorithm>
#include <codecvt>
#include <cstring>
#include <filesystem>
#include <locale>
#include <memory>
#include <string>
#include <system_error>
#include <type_traits>
#include <vector>
#ifdef _WIN32
@ -49,10 +54,6 @@
#include "ggml-rpc.h"
#endif
#ifdef GGML_USE_AMX
# include "ggml-amx.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
@ -61,9 +62,71 @@
#include "ggml-kompute.h"
#endif
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
// Deleter for dl_handle_ptr on Windows: unloads the module with FreeLibrary.
struct dl_handle_deleter {
    void operator()(HMODULE module) {
        FreeLibrary(module);
    }
};
// Loads the library at `path` (wide string) with LoadLibraryW.
// The process error mode is changed for the duration of the call and restored afterwards.
// Returns the module handle as obtained from LoadLibraryW.
static dl_handle * dl_load_library(const std::wstring & path) {
    // suppress error dialogs for missing DLLs
    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);

    HMODULE module = LoadLibraryW(path.c_str());

    SetErrorMode(saved_mode);
    return module;
}
// Narrow-string overload: converts `path` from UTF-8 to UTF-16 and forwards to the
// wide-string overload.
static dl_handle * dl_load_library(const std::string & path) {
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf8_to_utf16;
    const std::wstring wide_path = utf8_to_utf16.from_bytes(path);
    return dl_load_library(wide_path);
}
// Looks up the exported symbol `name` in `handle` with GetProcAddress.
// The process error mode is changed for the duration of the call and restored afterwards.
static void * dl_get_sym(dl_handle * handle, const char * name) {
    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);

    void * symbol = (void *) GetProcAddress(handle, name);

    SetErrorMode(saved_mode);
    return symbol;
}
#else
using dl_handle = void;
// Deleter for dl_handle_ptr on non-Windows builds: closes the library with dlclose.
struct dl_handle_deleter {
    void operator()(void * library) {
        dlclose(library);
    }
};
static void * dl_load_library(const std::string & path) {
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
// Looks up the symbol `name` in `handle` with dlsym.
static void * dl_get_sym(dl_handle * handle, const char * name) {
    void * symbol = dlsym(handle, name);
    return symbol;
}
#endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
struct ggml_backend_reg_entry {
ggml_backend_reg_t reg;
void * handle;
dl_handle_ptr handle;
};
struct ggml_backend_registry {
@ -92,9 +155,6 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg());
#endif
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
@ -104,13 +164,16 @@ struct ggml_backend_registry {
}
~ggml_backend_registry() {
while (!backends.empty()) {
// use silent since the log system may have been destroyed at this point
unload_backend(backends.back().reg, true);
// FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
// since backend threads may still be running and accessing resources from the dynamic library
for (auto & entry : backends) {
if (entry.handle) {
entry.handle.release(); // NOLINT
}
}
}
void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
if (!reg) {
return;
}
@ -119,7 +182,7 @@ struct ggml_backend_registry {
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
backends.push_back({ reg, handle });
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i));
}
@ -133,54 +196,31 @@ struct ggml_backend_registry {
}
ggml_backend_reg_t load_backend(const char * path, bool silent) {
#ifdef _WIN32
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryA(path);
dl_handle_ptr handle { dl_load_library(path) };
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
}
SetErrorMode(old_mode);
return nullptr;
}
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
SetErrorMode(old_mode);
if (!backend_init) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
}
FreeLibrary(handle);
return nullptr;
}
#else
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
if (!handle) {
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
}
return nullptr;
}
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
}
dlclose(handle);
return nullptr;
}
#endif
ggml_backend_reg_t reg = backend_init();
ggml_backend_reg_t reg = backend_init_fn();
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) {
if (!reg) {
@ -190,22 +230,19 @@ struct ggml_backend_registry {
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
}
}
#ifdef _WIN32
FreeLibrary(handle);
#else
dlclose(handle);
#endif
return nullptr;
}
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
register_backend(reg, handle);
register_backend(reg, std::move(handle));
return reg;
}
void unload_backend(ggml_backend_reg_t reg, bool silent) {
auto it = std::find_if(backends.begin(), backends.end(),
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
[reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
if (it == backends.end()) {
if (!silent) {
@ -224,15 +261,6 @@ struct ggml_backend_registry {
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
devices.end());
// unload library
if (it->handle) {
#ifdef _WIN32
FreeLibrary((HMODULE) it->handle);
#else
dlclose(it->handle);
#endif
}
// remove backend
backends.erase(it);
}
@ -348,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true);
}
void ggml_backend_load_all() {
std::vector<std::string> search_prefix;
// add the executable directory to the search path
// FIXME: this is convenient for development, but it should probably be disabled in production
static std::string get_executable_path() {
#if defined(__APPLE__)
// get executable path
std::vector<char> path;
@ -371,7 +394,7 @@ void ggml_backend_load_all() {
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
search_prefix.push_back(base_path + "/");
return base_path + "/";
#elif defined(__linux__)
std::string base_path = ".";
std::vector<char> path(1024);
@ -393,38 +416,104 @@ void ggml_backend_load_all() {
path.resize(path.size() * 2);
}
search_prefix.push_back(base_path + "/");
return base_path + "/";
#elif defined(_WIN32)
std::vector<char> path(MAX_PATH);
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
if (len == 0) {
return "";
}
std::string base_path(path.data(), len);
// remove executable name
auto last_slash = base_path.find_last_of('\\');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
return base_path + "\\";
#endif
}
auto & reg = get_reg();
auto try_load = [&](const std::string & name) {
std::string os_name;
static std::string backend_filename_prefix() {
#ifdef _WIN32
os_name = "ggml-" + name + ".dll";
return "ggml-";
#else
os_name = "libggml-" + name + ".so";
return "libggml-";
#endif
if (reg.load_backend(os_name.c_str(), true)) {
return;
}
for (const auto & prefix : search_prefix) {
if (reg.load_backend((prefix + os_name).c_str(), true)) {
return;
}
}
};
try_load("amx");
try_load("blas");
try_load("cann");
try_load("cuda");
try_load("hip");
try_load("kompute");
try_load("metal");
try_load("rpc");
try_load("sycl");
try_load("vulkan");
try_load("musa");
try_load("cpu");
// File extension of backend libraries: ".dll" on Windows, ".so" everywhere else.
static std::string backend_filename_suffix() {
#ifdef _WIN32
    const char * extension = ".dll";
#else
    const char * extension = ".so";
#endif
    return std::string(extension);
}
// Loads the best available variant of the backend `name`.
//
// Every regular file named [lib]ggml-<name>-*.[so|dll] in the search paths (the current
// directory and the executable directory) is opened and asked for its score through the
// optional ggml_backend_score symbol; the variant with the highest score above 0 is loaded.
// When no variant scores above 0, the base library [lib]ggml-<name>.[so|dll] is loaded
// from the first search path that contains it.
//
// All filesystem queries use the non-throwing error_code overloads, so an unreadable
// or vanished directory is skipped instead of raising std::filesystem::filesystem_error.
//
// name:   backend name, e.g. "cpu"
// silent: when true, load failures are not logged
// returns the registry of the loaded backend, or nullptr when nothing could be loaded
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
    namespace fs = std::filesystem;

    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
    // TODO: search system paths
    const std::vector<std::string> search_paths = { "./", get_executable_path() };
    const std::string file_prefix = backend_filename_prefix() + name + "-";
    const std::string file_suffix = backend_filename_suffix();

    int best_score = 0;
    std::string best_path;

    for (const auto & search_path : search_paths) {
        std::error_code ec;
        if (!fs::exists(search_path, ec)) {
            continue;
        }

        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied, ec);
        if (ec) {
            if (!silent) {
                GGML_LOG_ERROR("%s: failed to search %s: %s\n", __func__, search_path.c_str(), ec.message().c_str());
            }
            continue;
        }

        // stop scanning this directory as soon as advancing the iterator reports an error
        for (; !ec && dir_it != fs::directory_iterator(); dir_it.increment(ec)) {
            const fs::directory_entry & entry = *dir_it;

            std::error_code entry_ec;
            if (!entry.is_regular_file(entry_ec)) {
                continue;
            }

            const std::string filename = entry.path().filename().string();
            const std::string ext      = entry.path().extension().string();
            if (filename.compare(0, file_prefix.size(), file_prefix) != 0 || ext != file_suffix) {
                continue;
            }

            // the candidate is only opened to query its score; the handle is closed again
            // at the end of this iteration and the winner is reloaded below
            dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
            if (!handle) {
                if (!silent) {
                    GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
                }
                continue;
            }

            auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
            if (!score_fn) {
                continue;
            }

            const int s = score_fn();
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
#endif
            if (s > best_score) {
                best_score = s;
                best_path  = entry.path().string();
            }
        }
    }

    if (best_score == 0) {
        // try to load the base backend
        for (const auto & search_path : search_paths) {
            const std::string path = search_path + backend_filename_prefix() + name + file_suffix;
            std::error_code ec;
            if (fs::exists(path, ec)) {
                return get_reg().load_backend(path.c_str(), silent);
            }
        }
        return nullptr;
    }

    return get_reg().load_backend(best_path.c_str(), silent);
}
// Attempts to load the best variant of every known backend, silently, in a fixed order.
void ggml_backend_load_all() {
    static const char * const backend_names[] = {
        "blas",
        "cann",
        "cuda",
        "hip",
        "kompute",
        "metal",
        "rpc",
        "sycl",
        "vulkan",
        "musa",
        "cpu",
    };

    for (const char * backend_name : backend_names) {
        ggml_backend_load_best(backend_name, true);
    }
}

View File

@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
// since the tensor is pre-allocated, it cannot be moved to another backend
GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
}
// graph input

View File

@ -1,12 +1,20 @@
ggml_add_backend_library(ggml-cpu
ggml_add_backend_library(ggml-cpu)
list (APPEND GGML_CPU_SOURCES
ggml-cpu.c
ggml-cpu.cpp
ggml-cpu-aarch64.c
ggml-cpu-aarch64.h
ggml-cpu-quants.c
ggml-cpu-quants.h
amx/amx.cpp
amx/amx.h
amx/mmq.cpp
amx/mmq.h
ggml-cpu-impl.h
)
target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
target_include_directories(ggml-cpu PRIVATE .)
if (APPLE AND GGML_ACCELERATE)
@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
add_compile_definitions(GGML_USE_ACCELERATE)
add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
else()
@ -29,15 +37,9 @@ if (GGML_OPENMP)
if (OpenMP_FOUND)
message(STATUS "OpenMP found")
add_compile_definitions(GGML_USE_OPENMP)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
# FIXME: should be replaced with a compiler id check
#if (GGML_MUSA)
# list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
# list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
#endif()
else()
message(WARNING "OpenMP not found")
endif()
@ -46,9 +48,9 @@ endif()
if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")
add_compile_definitions(GGML_USE_LLAMAFILE)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
target_sources(ggml-cpu PRIVATE
list(APPEND GGML_CPU_SOURCES
llamafile/sgemm.cpp
llamafile/sgemm.h)
endif()
@ -58,7 +60,7 @@ if (GGML_CPU_HBM)
message(STATUS "Using memkind for CPU HBM")
add_compile_definitions(GGML_USE_CPU_HBM)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
target_link_libraries(ggml-cpu PUBLIC memkind)
endif()
@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
message(STATUS "ARM detected")
if (MSVC)
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
add_compile_definitions(__ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA)
list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
list(APPEND ARCH_DEFINITIONS __ARM_NEON)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled")
endif ()
@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
endif ()
@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
add_compile_definitions(__ARM_FEATURE_DOTPROD)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled")
endif ()
@ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
@ -185,43 +186,43 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
# macros corresponding to the extensions.
# Do it manually.
if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
endif()
if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
endif()
if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavxvnni)
endif()
endif()
else()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
@ -238,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_FLAGS -mavxvnni)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
@ -299,11 +303,16 @@ endif()
if (GGML_CPU_AARCH64)
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
add_compile_definitions(GGML_USE_CPU_AARCH64)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
endif()
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
# the feature detection code must be compiled without any architecture flags
target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")

View File

@ -0,0 +1,196 @@
#include "amx.h"
#include "common.h"
#include "mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <memory>
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// AMX buffer interface
// Buffer interface callback: releases the memory that buffer->context points to.
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    void * storage = buffer->context;
    free(storage);
}
// Buffer interface callback: the base address of the buffer is its context pointer.
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    void * const base = static_cast<void *>(buffer->context);
    return base;
}
// Buffer interface callback: fills `size` bytes of the tensor's data, starting
// `offset` bytes in, with `value`. The buffer handle itself is not consulted.
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * const dst = static_cast<char *>(tensor->data) + offset;
    memset(dst, value, size);
}
// Buffer interface callback: uploads `size` bytes from `data` into the tensor.
// Types for which qtype_has_amx_kernels() is true go through
// ggml_backend_amx_convert_weight(); every other type is copied verbatim.
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    if (!qtype_has_amx_kernels(tensor->type)) {
        // plain byte copy into the tensor storage
        memcpy(static_cast<char *>(tensor->data) + offset, data, size);
        return;
    }

    ggml_backend_amx_convert_weight(tensor, data, offset, size);
}
// Buffer interface callback: copies `size` bytes of tensor data, starting at
// `offset`, into `data`. Asserts that the tensor type has no AMX kernels, i.e.
// reading back tensors that set_tensor routes through the weight conversion is
// not supported here.
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));

    const char * const src = static_cast<const char *>(tensor->data) + offset;
    memcpy(data, src, size);
}
// Buffer interface callback: copies `src` into `dst`.
// Only sources that live in a host buffer are handled; for anything else the
// function returns false without touching `dst`. Types with AMX kernels are
// converted via ggml_backend_amx_convert_weight(), all others are memcpy'd.
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_UNUSED(buffer);

    if (!ggml_backend_buffer_is_host(src->buffer)) {
        return false;
    }

    if (qtype_has_amx_kernels(src->type)) {
        ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
    } else {
        memcpy(dst->data, src->data, ggml_nbytes(src));
    }
    return true;
}
// Buffer interface callback: sets every byte of the buffer (buffer->size bytes
// starting at buffer->context) to `value`.
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    void * const  base   = buffer->context;
    const size_t  nbytes = buffer->size;
    memset(base, value, nbytes);
}
// Callback table for AMX buffers; handed to ggml_backend_buffer_init() by the
// buffer type's alloc_buffer callback. Entries left NULL (init_tensor, reset)
// have no AMX-specific implementation.
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
};
// Buffer type callback: human-readable name of the buffer type.
// The address of this function is also used by ggml_backend_amx_buft_is_amx()
// to recognise the AMX buffer type.
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return "AMX";
}
// Buffer type callback: allocates a TENSOR_ALIGNMENT-aligned block of at least
// `size` bytes and wraps it in a backend buffer that uses the AMX callbacks.
// Returns NULL (after logging to stderr) when the allocation fails.
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    // aligned_alloc() is only specified for sizes that are an integral multiple
    // of the alignment, so round the request up; the buffer still reports the
    // size the caller asked for.
    const size_t alignment  = TENSOR_ALIGNMENT;
    const size_t alloc_size = (size + alignment - 1) / alignment * alignment;

    void * data = aligned_alloc(alignment, alloc_size);
    if (data == NULL) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
// Buffer type callback: alignment of allocations, the same TENSOR_ALIGNMENT
// that alloc_buffer passes to aligned_alloc().
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const size_t alignment = TENSOR_ALIGNMENT;
    return alignment;
}
// Buffer type callback: number of bytes to reserve for `tensor`; delegates to
// ggml_backend_amx_get_alloc_size().
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
    GGML_UNUSED(buft);

    const size_t nbytes = ggml_backend_amx_get_alloc_size(tensor);
    return nbytes;
}
// Buffer type callback: AMX buffers are reported as non-host.
// NOTE(review): presumably because set_tensor may store weights in a converted
// layout rather than the original one - confirm.
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return false;
}
// arch_prctl() request codes and XSAVE feature numbers used to ask the Linux
// kernel for permission to use the AMX tile data state.
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18

// Prepares the process for using AMX.
//
// Linux:   requests XFEATURE_XTILEDATA permission via arch_prctl(); returns
//          false (and logs to stderr) when the request is rejected.
// Windows: nothing to request, returns true.
// Other:   AMX cannot be enabled here, returns false. Without this branch the
//          function would fall off its end without returning a value.
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        fprintf(stderr, "AMX is not ready to be used!\n");
        return false;
    }
    return true;
#elif defined(_WIN32)
    return true;
#else
    return false;
#endif
}
// Returns the AMX buffer type, or NULL when ggml_amx_init() reports that AMX
// cannot be used. The descriptor is a function-local static that is attached to
// device 0 of the CPU backend registry; ggml_amx_init() is invoked on every call.
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
    static struct ggml_backend_buffer_type amx_buft = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_amx_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_amx_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_amx_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
            /* .is_host        = */ ggml_backend_amx_buffer_type_is_host,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ NULL,
    };

    const bool amx_ready = ggml_amx_init();
    if (amx_ready) {
        return &amx_buft;
    }
    return NULL;
}
// Tells whether `buft` is the AMX buffer type by checking that its get_name
// callback is the AMX one.
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
    const auto name_fn = buft->iface.get_name;
    return name_fn == ggml_backend_amx_buffer_type_get_name;
}
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is 32x
return can_use_amx;
}
default:
return false;
}
}
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)

View File

@ -0,0 +1,20 @@
#pragma once

#include "ggml-backend.h"
#include "ggml-cpu-impl.h"

#ifdef __cplusplus
extern "C" {
#endif

// The AMX entry points only exist when the translation unit is compiled with
// both AMX-INT8 and AVX512-VNNI enabled.
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

// Returns the AMX buffer type, or NULL when AMX cannot be initialised.
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);

// True when `buft` is the buffer type returned by ggml_backend_amx_buffer_type().
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);

// True when the AMX path is able to execute `op`.
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);

// Matrix multiplication for `dst` (dst = src1 @ src0.T), executed by the thread
// described in `params`.
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);

// Work-buffer size in bytes that ggml_backend_amx_mul_mat() needs for `dst`.
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);

#endif

#ifdef __cplusplus
}
#endif

View File

@ -1,8 +1,7 @@
#pragma once
#include "ggml.h"
// hack until AMX is moved into the CPU backend
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
#include "ggml-cpu-impl.h"
#include <algorithm>
#include <memory>
@ -74,16 +73,23 @@ inline void parallel_for(int nth, int n, const func_t& f) {
#endif
}
// Runs `f` on the calling thread's share of the index range [0, n).
// The share is obtained from balance211() using the thread count (params->nth)
// and thread index (params->ith); `f` is invoked exactly once with the two
// bounds balance211() produced.
template <typename func_t>
inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
    int first = 0;
    int last  = 0;
    balance211(n, params->nth, params->ith, first, last);
    f(first, last);
}
// quantized types that have AMX support
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
// TODO: fix padding for vnni format
return (type == GGML_TYPE_Q4_0) ||
(type == GGML_TYPE_Q4_1);
//(type == GGML_TYPE_Q8_0) ||
//(type == GGML_TYPE_Q4_K) ||
//(type == GGML_TYPE_Q5_K) ||
//(type == GGML_TYPE_Q6_K) ||
//(type == GGML_TYPE_IQ4_XS);
(type == GGML_TYPE_Q4_1) ||
(type == GGML_TYPE_Q8_0) ||
(type == GGML_TYPE_Q4_K) ||
(type == GGML_TYPE_Q5_K) ||
(type == GGML_TYPE_Q6_K) ||
(type == GGML_TYPE_IQ4_XS);
}
// ggml backend context

View File

@ -4,8 +4,11 @@
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#endif
#include "amx.h"
#include "mmq.h"
#include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu-quants.h"
#include "ggml-quants.h"
#include <algorithm>
#include <type_traits>
@ -33,7 +36,7 @@
#define ALWAYS_INLINE inline
#endif
#if defined(__AMX_INT8__)
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
namespace {
@ -496,13 +499,12 @@ inline void from_float(const float * x, char * vy, int64_t k);
template <>
inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
// FIXME: using unoptimized reference impl until moved to CPU backend
quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
quantize_row_q8_0(x, (block_q8_0 *)vy, k);
}
template <>
inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k);
quantize_row_q8_1(x, (block_q8_1 *)vy, k);
}
template <>
@ -950,7 +952,7 @@ template<typename TB, typename packed_B_t = packed_B_type<TB>>
void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) {
GGML_UNUSED(tile);
GGML_UNUSED(packed_B);
};
}
template <>
void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) {
@ -1338,21 +1340,19 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
__m512 vb[COLS];
__m512 vc[ROWS * COLS];
auto loadc = [&](int idx) {
auto loadc = [&](auto idx) {
vc[idx] = _mm512_setzero_ps();
};
Unroll<ROWS * COLS>{}(loadc);
auto compute = [&](int idx, int k) {
// TODO: use `constexpr` here to get rid of interger div
// when upgraded to C++17
const int row = idx / COLS;
const int col = idx % COLS;
auto compute = [&](auto idx, auto k) {
constexpr int row = idx / COLS;
constexpr int col = idx % COLS;
if (col == 0) {
if constexpr (col == 0) {
va = _mm512_loadu_ps(A + row * K + k);
}
if (row == 0) {
if constexpr (row == 0) {
vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k)));
}
vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
@ -1362,9 +1362,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
Unroll<ROWS * COLS>{}(compute, k);
}
auto storec = [&](int idx) {
const int row = idx / COLS;
const int col = idx % COLS;
auto storec = [&](auto idx) {
constexpr int row = idx / COLS;
constexpr int col = idx % COLS;
C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
};
Unroll<ROWS * COLS>{}(storec);
@ -1427,14 +1427,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
const __m512i off = _mm512_set1_epi8(8);
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a and compute compensation
if (col == 0) {
if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
vcomp = _mm512_setzero_si512();
for (int k = 0; k < 8; ++k) {
@ -1466,7 +1466,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1490,14 +1490,14 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a
if (col == 0) {
if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]);
@ -1531,7 +1531,7 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1562,14 +1562,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
//
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a and add offset 128
if (col == 0) {
if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]);
@ -1602,7 +1602,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1634,7 +1634,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
@ -1648,9 +1648,9 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
// int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
// from {16, 8} to {4, 32}
//
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a
if (col == 0) {
if constexpr (col == 0) {
for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
}
@ -1702,7 +1702,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1735,15 +1735,15 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
// Q5_K and Q4_K shares the same vnni formats, refer to notes above.
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a
if (col == 0) {
if constexpr (col == 0) {
for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
}
@ -1808,7 +1808,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1841,13 +1841,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i m32s = _mm512_set1_epi32(32);
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
if (col == 0) {
auto compute = [&](auto col, auto i) {
if constexpr (col == 0) {
// load a
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
@ -1959,13 +1959,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
const __m512i values256 = _mm512_add_epi8(values128, off);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
if (col == 0) {
auto compute = [&](auto col, auto i) {
if constexpr (col == 0) {
// load a
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
@ -2015,7 +2015,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -2327,9 +2327,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) {
// pack weight to vnni format
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor);
GGML_ASSERT(alloc_size == size);
GGML_ASSERT(offset == 0 && size == ggml_nbytes(tensor)); // only full tensor conversion is supported for now
const enum ggml_type TYPE = tensor->type;
@ -2348,6 +2346,29 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
});
}
// Size in bytes of the work buffer needed to hold src1 converted to the
// vec_dot type of src0: M rows of (K / blck_size) blocks each.
// F16 weights need no work buffer, so 0 is returned for them.
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
    const struct ggml_tensor * weights = dst->src[0];
    const enum ggml_type TYPE = weights->type;

    if (TYPE == GGML_TYPE_F16) {
        return 0; // floating point path does not quantize A
    }

    const int M = dst->ne[1];
    const int K = weights->ne[0];

    size_t wsize = 0;
    // blck_size and vec_dot_type are provided by the dispatch macro for TYPE
    GGML_DISPATCH_QTYPES(TYPE, [&] {
        const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
        wsize = M * row_size_A;
    });

    return wsize;
}
// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
//
// src0: weight in shape of {N, K}, quantized
@ -2356,14 +2377,12 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
//
// the function performs: dst = src1 @ src0.T
//
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0];
struct ggml_tensor * src1 = dst->src[1];
const enum ggml_type TYPE = src0->type;
const int n_threads = ctx->n_threads;
// f16 only has avx512 kernels for now,
// amx kernels will be added once 6th gen xeon is released.
const bool is_floating_type = TYPE == GGML_TYPE_F16;
@ -2379,7 +2398,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N);
parallel_for(n_threads, MB * NB, [&](int begin, int end) {
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
for (int i = begin; i < end; ++i) {
int mb = i / NB;
@ -2412,17 +2431,16 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
}
// pointer to work space, used convert A from float to quantized type
void * wdata = nullptr;
void * wdata = params->wdata;
//TODO: performance improvement: merge quant A
if (params->ith == 0) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
const size_t desired_wsize = M * row_size_A;
if (ctx->work_size < desired_wsize) {
ctx->work_data.reset(new char[desired_wsize]);
ctx->work_size = desired_wsize;
if (params->wsize < desired_wsize) {
GGML_ABORT("insufficient work space size");
}
wdata = ctx->work_data.get();
// Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
@ -2433,6 +2451,9 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
}
});
}
ggml_barrier(params->threadpool);
if (M == 1) {
// MB = 1 and handle 8 tiles in each block
@ -2440,7 +2461,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
constexpr int BLOCK_N = TILE_N * kTilesN;
const int NB = div_up(N, BLOCK_N);
parallel_for(n_threads, NB, [&](int begin, int end) {
parallel_for_ggml(params, NB, [&](int begin, int end) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const int KB = K / blck_size;
const int TILE_SIZE = get_tile_size<type>();
@ -2470,7 +2491,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N);
parallel_for(n_threads, MB * NB, [&](int begin, int end) {
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
// init tile config for each thread
ggml_tile_config_init();
@ -2498,13 +2519,4 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
});
}
#else // if defined(__AMX_INT8__)
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
fprintf(stderr, "GGML is not compiled with AMX support!\n");
GGML_UNUSED(ctx);
GGML_UNUSED(dst);
}
#endif // if defined(__AMX_INT8__)
#endif // if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

View File

@ -1,6 +1,5 @@
#pragma once
#include "common.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
@ -10,7 +9,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}

View File

@ -0,0 +1,298 @@
#include "ggml-cpu.h"
#include "ggml-backend-impl.h"
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <cstring>
#include <vector>
#include <bitset>
#include <array>
#include <string>
// Snapshot of the x86 CPUID feature flags of the executing CPU.
//
// The constructor queries CPUID once and stores the relevant registers in
// bitsets; the accessor functions only read those bitsets, so they are const
// and have no side effects. Accessors named after vendor-specific features
// additionally require the matching vendor string.
struct cpuid_x86 {
    // leaf 1, ECX
    bool SSE3(void) const { return f_1_ecx[0]; }
    bool PCLMULQDQ(void) const { return f_1_ecx[1]; }
    bool MONITOR(void) const { return f_1_ecx[3]; }
    bool SSSE3(void) const { return f_1_ecx[9]; }
    bool FMA(void) const { return f_1_ecx[12]; }
    bool CMPXCHG16B(void) const { return f_1_ecx[13]; }
    bool SSE41(void) const { return f_1_ecx[19]; }
    bool SSE42(void) const { return f_1_ecx[20]; }
    bool MOVBE(void) const { return f_1_ecx[22]; }
    bool POPCNT(void) const { return f_1_ecx[23]; }
    bool AES(void) const { return f_1_ecx[25]; }
    bool XSAVE(void) const { return f_1_ecx[26]; }
    bool OSXSAVE(void) const { return f_1_ecx[27]; }
    bool AVX(void) const { return f_1_ecx[28]; }
    bool F16C(void) const { return f_1_ecx[29]; }
    bool RDRAND(void) const { return f_1_ecx[30]; }

    // leaf 1, EDX
    bool MSR(void) const { return f_1_edx[5]; }
    bool CX8(void) const { return f_1_edx[8]; }
    bool SEP(void) const { return f_1_edx[11]; }
    bool CMOV(void) const { return f_1_edx[15]; }
    bool CLFSH(void) const { return f_1_edx[19]; }
    bool MMX(void) const { return f_1_edx[23]; }
    bool FXSR(void) const { return f_1_edx[24]; }
    bool SSE(void) const { return f_1_edx[25]; }
    bool SSE2(void) const { return f_1_edx[26]; }

    // leaf 7 (sub-leaf 0), EBX
    bool FSGSBASE(void) const { return f_7_ebx[0]; }
    bool BMI1(void) const { return f_7_ebx[3]; }
    bool HLE(void) const { return is_intel && f_7_ebx[4]; }
    bool AVX2(void) const { return f_7_ebx[5]; }
    bool BMI2(void) const { return f_7_ebx[8]; }
    bool ERMS(void) const { return f_7_ebx[9]; }
    bool INVPCID(void) const { return f_7_ebx[10]; }
    bool RTM(void) const { return is_intel && f_7_ebx[11]; }
    bool AVX512F(void) const { return f_7_ebx[16]; }
    bool RDSEED(void) const { return f_7_ebx[18]; }
    bool ADX(void) const { return f_7_ebx[19]; }
    bool AVX512PF(void) const { return f_7_ebx[26]; }
    bool AVX512ER(void) const { return f_7_ebx[27]; }
    bool AVX512CD(void) const { return f_7_ebx[28]; }
    bool SHA(void) const { return f_7_ebx[29]; }

    // leaf 7 (sub-leaf 0), ECX
    bool PREFETCHWT1(void) const { return f_7_ecx[0]; }

    // extended leaf 0x80000001, ECX / EDX
    bool LAHF(void) const { return f_81_ecx[0]; }
    bool LZCNT(void) const { return is_intel && f_81_ecx[5]; }
    bool ABM(void) const { return is_amd && f_81_ecx[5]; }
    bool SSE4a(void) const { return is_amd && f_81_ecx[6]; }
    bool XOP(void) const { return is_amd && f_81_ecx[11]; }
    bool TBM(void) const { return is_amd && f_81_ecx[21]; }
    bool SYSCALL(void) const { return is_intel && f_81_edx[11]; }
    bool MMXEXT(void) const { return is_amd && f_81_edx[22]; }
    bool RDTSCP(void) const { return is_intel && f_81_edx[27]; }
    bool _3DNOWEXT(void) const { return is_amd && f_81_edx[30]; }
    bool _3DNOW(void) const { return is_amd && f_81_edx[31]; }

    // leaf 7, sub-leaf 0 (ECX / EDX) and sub-leaf 1 (EAX)
    bool AVX512_VBMI(void) const { return f_7_ecx[1]; }
    bool AVX512_VNNI(void) const { return f_7_ecx[11]; }
    bool AVX512_FP16(void) const { return f_7_edx[23]; }
    bool AVX512_BF16(void) const { return f_7_1_eax[5]; }
    bool AVX_VNNI(void) const { return f_7_1_eax[4]; }

    bool AMX_TILE(void) const { return f_7_edx[24]; }
    bool AMX_INT8(void) const { return f_7_edx[25]; }
    bool AMX_FP16(void) const { return f_7_1_eax[21]; }
    bool AMX_BF16(void) const { return f_7_edx[22]; }

#ifdef _MSC_VER
    // Executes CPUID for leaf `eax`; cpu_info receives EAX, EBX, ECX, EDX.
    static void cpuid(int cpu_info[4], int eax) {
        __cpuid(cpu_info, eax);
    }
    // Executes CPUID for leaf `eax`, sub-leaf `ecx`.
    static void cpuidex(int cpu_info[4], int eax, int ecx) {
        __cpuidex(cpu_info, eax, ecx);
    }
#else
    // Executes CPUID for leaf `eax` (sub-leaf 0); cpu_info receives EAX, EBX, ECX, EDX.
    static void cpuid(int cpu_info[4], int eax) {
        __asm__ __volatile__(
            "cpuid"
            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
            : "a"(eax), "c"(0));
    }
    // Executes CPUID for leaf `eax`, sub-leaf `ecx`.
    static void cpuidex(int cpu_info[4], int eax, int ecx) {
        __asm__ __volatile__(
            "cpuid"
            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
            : "a"(eax), "c"(ecx));
    }
#endif

    // Queries the standard and extended CPUID leaves and caches the results.
    cpuid_x86() {
        std::array<int, 4> cpui;
        std::vector<std::array<int, 4>> data;

        // calling __cpuid with 0x0 as the function_id argument
        // gets the number of the highest valid function ID.
        cpuid(cpui.data(), 0);
        int n_ids = cpui[0];

        for (int i = 0; i <= n_ids; ++i) {
            cpuidex(cpui.data(), i, 0);
            data.push_back(cpui);
        }

        // capture vendor string (register order is EBX, EDX, ECX);
        // memcpy avoids writing ints through a char buffer, which would break
        // the aliasing and alignment rules
        char vendor[0x20] = {};
        std::memcpy(vendor + 0, &data[0][1], sizeof(int));
        std::memcpy(vendor + 4, &data[0][3], sizeof(int));
        std::memcpy(vendor + 8, &data[0][2], sizeof(int));
        this->vendor = vendor;
        if (this->vendor == "GenuineIntel") {
            is_intel = true;
        } else if (this->vendor == "AuthenticAMD") {
            is_amd = true;
        }

        // load bitset with flags for function 0x00000001
        if (n_ids >= 1) {
            f_1_ecx = data[1][2];
            f_1_edx = data[1][3];
        }

        // load bitset with flags for function 0x00000007
        if (n_ids >= 7) {
            f_7_ebx = data[7][1];
            f_7_ecx = data[7][2];
            f_7_edx = data[7][3];
            cpuidex(cpui.data(), 7, 1);
            f_7_1_eax = cpui[0];
        }

        // calling __cpuid with 0x80000000 as the function_id argument
        // gets the number of the highest valid extended ID.
        cpuid(cpui.data(), 0x80000000);
        unsigned int n_ex_ids = cpui[0];

        std::vector<std::array<int, 4>> ext_data;
        for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
            cpuidex(cpui.data(), i, 0);
            ext_data.push_back(cpui);
        }

        // load bitset with flags for function 0x80000001
        if (n_ex_ids >= 0x80000001) {
            f_81_ecx = ext_data[1][2];
            f_81_edx = ext_data[1][3];
        }

        // interpret CPU brand string if reported
        char brand[0x40] = {};
        if (n_ex_ids >= 0x80000004) {
            std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
            std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
            std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
            this->brand = brand;
        }
    }

    bool is_intel = false;
    bool is_amd = false;
    std::string vendor;
    std::string brand;
    std::bitset<32> f_1_ecx;
    std::bitset<32> f_1_edx;
    std::bitset<32> f_7_ebx;
    std::bitset<32> f_7_ecx;
    std::bitset<32> f_7_edx;
    std::bitset<32> f_7_1_eax;
    std::bitset<32> f_81_ecx;
    std::bitset<32> f_81_edx;
};
#if 0
// Debug helper (compiled out): prints the vendor, brand and every feature flag
// exposed by cpuid_x86. Enabling it requires <cstdio> for printf.
void test_x86_is() {
    cpuid_x86 is;
    printf("CPU Vendor: %s\n", is.vendor.c_str());
    printf("Brand: %s\n", is.brand.c_str());
    printf("is_intel: %d\n", is.is_intel);
    printf("is_amd: %d\n", is.is_amd);
    printf("sse3: %d\n", is.SSE3());
    printf("pclmulqdq: %d\n", is.PCLMULQDQ());
    printf("ssse3: %d\n", is.SSSE3());
    printf("fma: %d\n", is.FMA());
    printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
    printf("sse41: %d\n", is.SSE41());
    printf("sse42: %d\n", is.SSE42());
    printf("movbe: %d\n", is.MOVBE());
    printf("popcnt: %d\n", is.POPCNT());
    printf("aes: %d\n", is.AES());
    printf("xsave: %d\n", is.XSAVE());
    printf("osxsave: %d\n", is.OSXSAVE());
    printf("avx: %d\n", is.AVX());
    printf("f16c: %d\n", is.F16C());
    printf("rdrand: %d\n", is.RDRAND());
    printf("msr: %d\n", is.MSR());
    printf("cx8: %d\n", is.CX8());
    printf("sep: %d\n", is.SEP());
    printf("cmov: %d\n", is.CMOV());
    printf("clflush: %d\n", is.CLFSH());
    printf("mmx: %d\n", is.MMX());
    printf("fxsr: %d\n", is.FXSR());
    printf("sse: %d\n", is.SSE());
    printf("sse2: %d\n", is.SSE2());
    printf("fsgsbase: %d\n", is.FSGSBASE());
    printf("bmi1: %d\n", is.BMI1());
    printf("hle: %d\n", is.HLE());
    printf("avx2: %d\n", is.AVX2());
    printf("bmi2: %d\n", is.BMI2());
    printf("erms: %d\n", is.ERMS());
    printf("invpcid: %d\n", is.INVPCID());
    printf("rtm: %d\n", is.RTM());
    printf("avx512f: %d\n", is.AVX512F());
    printf("rdseed: %d\n", is.RDSEED());
    printf("adx: %d\n", is.ADX());
    printf("avx512pf: %d\n", is.AVX512PF());
    printf("avx512er: %d\n", is.AVX512ER());
    printf("avx512cd: %d\n", is.AVX512CD());
    printf("sha: %d\n", is.SHA());
    printf("prefetchwt1: %d\n", is.PREFETCHWT1());
    printf("lahf: %d\n", is.LAHF());
    printf("lzcnt: %d\n", is.LZCNT());
    printf("abm: %d\n", is.ABM());
    printf("sse4a: %d\n", is.SSE4a());
    printf("xop: %d\n", is.XOP());
    printf("tbm: %d\n", is.TBM());
    printf("syscall: %d\n", is.SYSCALL());
    printf("mmxext: %d\n", is.MMXEXT());
    printf("rdtscp: %d\n", is.RDTSCP());
    printf("3dnowext: %d\n", is._3DNOWEXT());
    printf("3dnow: %d\n", is._3DNOW());
    printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
    printf("avx512_vnni: %d\n", is.AVX512_VNNI());
    printf("avx512_fp16: %d\n", is.AVX512_FP16());
    printf("avx512_bf16: %d\n", is.AVX512_BF16());
    // AVX_VNNI is part of the backend score, so include it in the dump as well
    printf("avx_vnni: %d\n", is.AVX_VNNI());
    printf("amx_tile: %d\n", is.AMX_TILE());
    printf("amx_int8: %d\n", is.AMX_INT8());
    printf("amx_fp16: %d\n", is.AMX_FP16());
    printf("amx_bf16: %d\n", is.AMX_BF16());
}
#endif
static int ggml_backend_cpu_x86_score() {
// FIXME: this does not check for OS support
cpuid_x86 is;
// if the CPU backend was built with any features not supported by the current CPU, it cannot be used
if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
// calculate a backend score based on the supported features
// more important features have a higher weight
int score = 0;
score += ggml_cpu_has_fma () * 1;
score += ggml_cpu_has_f16c () * 1<<1;
score += ggml_cpu_has_ssse3 () * 1<<2;
score += ggml_cpu_has_sse3 () * 1<<3;
score += ggml_cpu_has_avx_vnni () * 1<<4;
score += ggml_cpu_has_avx () * 1<<5;
score += ggml_cpu_has_avx2 () * 1<<6;
score += ggml_cpu_has_avx512 () * 1<<7;
// score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
score += ggml_cpu_has_avx512_bf16() * 1<<9;
score += ggml_cpu_has_avx512_vnni() * 1<<10;
score += ggml_cpu_has_amx_int8 () * 1<<11;
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))

View File

@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
}
static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
#if defined(__AVX512VNNI__)
const __m512i zero = _mm512_setzero_si512();
return _mm512_dpbusd_epi32(zero, ax, sy);
#else
@ -525,67 +525,47 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
__asm__ __volatile__(
"movi v31.16b, #0x4\n"
"movi v30.16b, #0xf0\n"
"add %x[b_ptr], %x[b_ptr], #0x8\n"
"1:" // Column loop
"add x22, %x[a_ptr], #0x2\n"
"movi v29.16b, #0x0\n"
"mov x21, %x[nb]\n"
"2:" // Block loop
"ldr q28, [%x[b_ptr], #0x0]\n"
"ldr q27, [x22, #0x0]\n"
"movi v26.4s, #0x0\n"
"sub x20, x22, #0x2\n"
"ldr q25, [x22, #0x10]\n"
"ldr q24, [%x[b_ptr], #0x10]\n"
"sub x21, x21, #0x1\n"
"add x22, x22, #0x22\n"
"ldr q23, [%x[b_ptr], #0x20]\n"
"ldr q22, [%x[b_ptr], #0x30]\n"
"ld1r { v21.8h }, [x20]\n"
"ldr q20, [%x[b_ptr], #-0x8]\n"
"sshl v16.16b, v28.16b, v31.16b\n"
"and v28.16b, v28.16b, v30.16b\n"
"sshl v19.16b, v24.16b, v31.16b\n"
"and v24.16b, v24.16b, v30.16b\n"
"add %x[b_ptr], %x[b_ptr], #0x48\n"
"sshl v18.16b, v23.16b, v31.16b\n"
"and v23.16b, v23.16b, v30.16b\n"
".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n"
"sshl v17.16b, v22.16b, v31.16b\n"
"and v22.16b, v22.16b, v30.16b\n"
"fcvtl v21.4s, v21.4h\n"
"fcvtl v16.4s, v20.4h\n"
".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n"
"fmul v16.4s, v16.4s, v21.4s\n"
".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n"
".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n"
".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n"
".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n"
".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n"
".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n"
"scvtf v26.4s, v26.4s, #0x4\n"
"fmla v29.4s, v26.4s, v16.4s\n"
"cbnz x21, 2b\n"
"sub %x[nc], %x[nc], #0x4\n"
"str q29, [%x[res_ptr], #0x0]\n"
"add %x[res_ptr], %x[res_ptr], #0x10\n"
"cbnz %x[nc], 1b\n"
: [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
: [a_ptr] "r" (a_ptr), [nb] "r" (nb)
: "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
);
for (int c = 0; c < nc; c += ncols_interleaved) {
const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
float32x4_t acc = vdupq_n_f32(0);
for (int b = 0; b < nb; b++) {
int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
int8x16_t a0 = vld1q_s8(a_ptr->qs);
int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
int32x4_t ret = vdupq_n_s32(0);
ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
a_ptr++;
b_ptr++;
}
vst1q_f32(s, acc);
s += ncols_interleaved;
}
return;
}
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
float sumf[4];
int sumi;
@ -1020,7 +1000,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
float * res_ptr = s;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
float32x4_t sumf = vdupq_n_f32(0);
for (int l = 0; l < nb; l++) {
@ -3507,7 +3487,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
float32x4_t sumf[4];
for (int m = 0; m < 4; m++) {

View File

@ -15,6 +15,18 @@
extern "C" {
#endif
// Compute parameters: the calling thread's position, the shared work buffer
// and the thread pool.
struct ggml_compute_params {
    int ith; // thread index
    int nth; // number of threads

    // work buffer for all threads
    size_t wsize;
    void * wdata;

    struct ggml_threadpool * threadpool;
};
#if defined(_MSC_VER)
#define m512bh(p) p
@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
}
#endif
// TODO: move to ggml-threading
void ggml_barrier(struct ggml_threadpool * tp);
#ifdef __cplusplus
}
#endif

View File

@ -1791,11 +1791,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@ -2347,10 +2348,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
const block_q8_1 * restrict b_y0 = &vy0[i];
const block_q8_1 * restrict b_y1 = &vy1[i];
float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
float32_t summs_t[4] = {
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)
};
summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
const uint8x16_t m4b = vdupq_n_u8(0x0F);
@ -2371,10 +2374,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
// mmla into int32x4_t
float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@ -2394,10 +2399,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
sumv2 = vaddq_f32(sumv2, summs0);
vst1_f32(s, vget_low_f32 (sumv2));
vst1_f32(s + bs, vget_high_f32(sumv2));
return;
}
#endif
@ -3374,10 +3381,12 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@ -3395,11 +3404,13 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
l1, r1)), l2, r2)), l3, r3))), scale);
}
float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
vst1_f32(s, vget_low_f32 (sumv2));
vst1_f32(s + bs, vget_high_f32(sumv2));
return;
}
#endif

View File

@ -10,6 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
#include "amx/amx.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
@ -624,7 +625,7 @@ do { \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
res = _mm512_reduce_add_ps(x[0]); \
res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)
// TODO: is this optimal ?
@ -674,7 +675,7 @@ do { \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
res = _mm512_reduce_add_ps(x[0]); \
res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)
#define GGML_F16_VEC GGML_F32Cx16
@ -685,8 +686,8 @@ do { \
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#elif defined(__AVX__)
#define GGML_SIMD
@ -1367,31 +1368,15 @@ struct ggml_compute_state {
int ith;
};
struct ggml_compute_params {
// ith = thread index, nth = number of threads
int ith, nth;
// work buffer for all threads
size_t wsize;
void * wdata;
struct ggml_threadpool * threadpool;
};
//
// fundamental operations
//
// Stores v into the first n elements of x; does nothing when n <= 0.
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) {
    for (int remaining = n; remaining > 0; --remaining) {
        *x++ = v;
    }
}
// Stores v into the first n elements of x; does nothing when n <= 0.
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) {
    for (int remaining = n; remaining > 0; --remaining) {
        *x++ = v;
    }
}
// Stores v into the first n elements of x; does nothing when n <= 0.
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
    for (int remaining = n; remaining > 0; --remaining) {
        *x++ = v;
    }
}
// Stores v into the first n elements of x; does nothing when n <= 0.
// NOTE(review): v is an int32_t assigned directly to ggml_fp16_t elements,
// so the stored value depends on an implicit conversion — confirm this is intended.
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) {
    for (int remaining = n; remaining > 0; --remaining) {
        *x++ = v;
    }
}
// Stores a copy of v into the first n elements of x; does nothing when n <= 0.
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) {
    for (int remaining = n; remaining > 0; --remaining) {
        *x++ = v;
    }
}
// Element-wise sum: z[i] = x[i] + y[i] for i in [0, n), processed front to back.
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    for (int remaining = n; remaining > 0; --remaining) {
        *z++ = *x++ + *y++;
    }
}
// Adds the scalar v to each element: z[i] = x[i] + v for i in [0, n), processed front to back.
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
    for (int remaining = n; remaining > 0; --remaining) {
        *z++ = *x++ + v;
    }
}
// Accumulates x into y in place: y[i] += x[i] for i in [0, n), processed front to back.
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) {
    for (int remaining = n; remaining > 0; --remaining) {
        *y++ += *x++;
    }
}
@ -2286,7 +2271,7 @@ struct ggml_state {
static struct ggml_state g_state = {0};
static void ggml_barrier(struct ggml_threadpool * tp) {
void ggml_barrier(struct ggml_threadpool * tp) {
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
if (n_threads == 1) {
return;
@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat(
type = (enum ggml_type)(intptr_t)src0->extra;
}
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
ggml_backend_amx_mul_mat(params, dst);
return;
}
#endif
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
@ -7641,8 +7633,8 @@ UseGgmlGemm2:;
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
int64_t num_rows_per_vec_dot = vec_dot_num_rows;
// TODO: currently the mmla kernels support only even numbered rows/cols.
// this check can be removed once they are extended to support odd numbered rows/cols too
// these checks are needed to avoid crossing dim1 boundaries
// can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
num_rows_per_vec_dot = 1;
}
@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan(
} break;
case GGML_OP_MUL_MAT:
{
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
cur = ggml_backend_amx_desired_wsize(node);
}
#endif
const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
if (node->src[1]->type != vec_dot_type) {
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
cur = MAX(cur, cur2);
}
} break;
case GGML_OP_MUL_MAT_ID:

View File

@ -3,6 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-impl.h"
#include "amx/amx.h"
#include <cctype>
#include <string>
#include <vector>
@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
std::vector<ggml_backend_buffer_type_t> bufts;
#ifdef GGML_USE_CPU_HBM
bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (ggml_backend_amx_buffer_type()) {
bufts.push_back(ggml_backend_amx_buffer_type());
}
#endif
#ifdef GGML_USE_CPU_AARCH64
if (ggml_backend_cpu_aarch64_buffer_type()) {
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
}
#endif
bufts.push_back(NULL);
@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
return true;
}
if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
return false;
}
}
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
return ggml_backend_amx_device_supports_op(op);
}
for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
return false;
}
}
#endif
for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
return false;
@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
}
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
supported = supported || ggml_backend_amx_buft_is_amx(buft);
#endif
return supported;
GGML_UNUSED(dev);
}

View File

@ -50,8 +50,7 @@
#include "sgemm.h"
#include "ggml-impl.h"
// hack until moved into the CPU backend
#include "../ggml-cpu-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"
#ifdef _MSC_VER

View File

@ -30,11 +30,13 @@
extern "C" {
#endif
#undef MIN
#undef MAX
#ifndef MIN
# define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
#ifndef MAX
# define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
// required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32

View File

@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.device_count = dpct::dev_mgr::instance().device_count();
if (info.device_count == 0) {
fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
return info;
}
@ -55,16 +55,16 @@ static ggml_sycl_device_info ggml_sycl_init() {
int64_t total_vram = 0;
#if defined(GGML_SYCL_FORCE_MMQ)
fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
#else
fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
#endif
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
#else
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
#endif
fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
for (int i = 0; i < info.device_count; ++i) {
info.devices[i].vmm = 0;
@ -110,7 +110,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
auto global_mem_size = prop.get_global_mem_size()/1000000;
fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
name.c_str(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
@ -120,18 +120,29 @@ void ggml_backend_sycl_print_sycl_devices() {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
int device_count = dpct::dev_mgr::instance().device_count();
std::map<std::string, size_t> DeviceNums;
fprintf(stderr, "found %d SYCL devices:\n", device_count);
fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
GGML_LOG_INFO("Found %d SYCL devices:\n", device_count);
GGML_LOG_INFO(
"| | | | "
" |Max | |Max |Global | |\n");
GGML_LOG_INFO(
"| | | | "
" |compute|Max work|sub |mem | |\n");
GGML_LOG_INFO(
"|ID| Device Type| "
"Name|Version|units |group |group|size | Driver version|\n");
GGML_LOG_INFO(
"|--|-------------------|---------------------------------------|------"
"-|-------|--------|-----|-------|---------------------|\n");
for (int id = 0; id < device_count; ++id) {
sycl::device device = dpct::dev_mgr::instance().get_device(id);
sycl::backend backend = device.get_backend();
std::string backend_type = get_device_backend_and_type(device);
int type_id = DeviceNums[backend_type]++;
std::stringstream device_type;
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
device_type << "[" << backend_type << ":" << std::to_string(type_id)
<< "]";
print_device_detail(id, device, device_type.str());
}
}
@ -154,15 +165,14 @@ static void ggml_check_sycl() try {
static bool initialized = false;
if (!initialized) {
fprintf(stderr, "[SYCL] call ggml_check_sycl\n");
GGML_LOG_INFO("[SYCL] call ggml_check_sycl\n");
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
GGML_LOG_INFO("%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
#if defined(GGML_SYCL_F16)
fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
GGML_LOG_INFO("%s: GGML_SYCL_F16: yes\n", __func__);
#else
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
GGML_LOG_INFO("%s: GGML_SYCL_F16: no\n", __func__);
#endif
/* NOT REMOVE, keep it for next optimize for XMX.
@ -180,9 +190,10 @@ static void ggml_check_sycl() try {
return;
}
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
ggml_backend_sycl_print_sycl_devices();
initialized = true;
g_sycl_loaded = true;
ggml_backend_sycl_print_sycl_devices();
}
}
catch (sycl::exception const &exc) {
@ -205,7 +216,7 @@ inline void check_allow_gpu_index(const int device_index) {
__func__,
device_index,
ggml_sycl_info().device_count - 1);
fprintf(stderr, "%s\n", error_buf);
GGML_LOG_ERROR("%s\n", error_buf);
assert(false);
}
}
@ -475,7 +486,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
size, *stream)));
if (!dev_ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
}
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
@ -752,7 +763,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
size, *stream)));
if (!buf) {
char err_buf[1024];
snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
throw std::runtime_error(err_buf);
}
// set padding to 0 to avoid possible NaN values
@ -1142,7 +1153,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
look_ahead_size, *qptr)));
if (!ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
return nullptr;
}
@ -1150,9 +1161,10 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
pool_size += look_ahead_size;
#ifdef DEBUG_SYCL_MALLOC
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
GGML_LOG_DEBUG("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
(uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
#endif
// GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr);
return ptr;
}
@ -1166,7 +1178,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
return;
}
}
fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
pool_size -= size;
}
@ -2437,7 +2449,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
break;
default:
// TODO: k-quants
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_ABORT("fatal error");
break;
}
@ -3447,8 +3459,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// TODO: Refactor and cleanup of mul mat dispatching.
if (src0->ne[3] == 1 && src1->ne[3] == 1) {
// KQ single-batch
// mmv p021 was specific for these dimensions
ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
} else {
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
}
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
@ -3743,7 +3762,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ABORT("fatal error");
}
@ -3818,7 +3837,7 @@ void ggml_sycl_set_main_device(const int main_device) try {
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(main_device))));
fprintf(stderr, "Using device %d (%s) as main device\n",
GGML_LOG_INFO("Using device %d (%s) as main device\n",
main_device, prop.get_name());
}
}
@ -4165,7 +4184,7 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
#endif
bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
if (!ok) {
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
GGML_ASSERT(ok);
}
@ -4486,7 +4505,7 @@ static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_
static int64_t get_op_batch_size(const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_GET_ROWS:
return op->ne[1]; // this will increse the speed of prefill in test
return 0;
case GGML_OP_MUL_MAT:
return op->ne[1];
case GGML_OP_MUL_MAT_ID:
@ -4665,7 +4684,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
if (ctx == nullptr) {
fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return nullptr;
};

View File

@ -1231,6 +1231,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
std::cerr << "ggml_vulkan: Compiling shaders";
// some shaders require the subgroup size to be 16 or larger
const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
// mulmat
std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
@ -1240,11 +1243,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
s_warptile = { std::max(device->subgroup_size, 16u), 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
s_warptile_mmq = { std::max(device->subgroup_size, 16u), 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
@ -1431,7 +1434,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@ -1445,7 +1448,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
// Keep the trailing constant list in sync with the other {2, 1, 1} pipelines (e.g. the f32_f32 and id IQ4_NL variants), which all pass {subgroup_size, 2}.
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@ -1459,7 +1462,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
// dequant shaders

View File

@ -3,5 +3,5 @@ find_package (Threads REQUIRED)
set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)

View File

@ -4,9 +4,11 @@
#include "mul_mat_vec_base.comp"
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32];
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
@ -21,21 +23,19 @@ void main() {
const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
const uint step = 8;
const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = tid - step*v_im; // 0...15 or 0...7
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...15 or 0...7
#if K_QUANTS_PER_ITERATION == 1
const uint l0 = v_in; // 0...15
const uint is = 0;
#else
const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28
const uint is = v_in / 4;
#endif
const uint ql_offset = 64*v_im + l0;
const uint qh_offset = 32*v_im + l0;
@ -44,7 +44,7 @@ void main() {
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
@ -95,10 +95,10 @@ void main() {
}
tmp[gl_LocalInvocationID.x] = temp;
// sum up partial sums and write back result
barrier();
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}

6
grammars/english.gbnf Normal file
View File

@ -0,0 +1,6 @@
# note: this might be incomplete, mostly an example
root ::= en-char+ ([ \t\n] en-char+)*
en-char ::= letter | digit | punctuation
letter ::= [a-zA-Z]
digit ::= [0-9]
punctuation ::= [!"#$%&'()*+,-./:;<=>?@[\\\]^_`{|}~]

View File

@ -990,6 +990,9 @@ extern "C" {
char * buf,
int32_t length);
// Get list of built-in chat templates
int32_t llama_chat_builtin_templates(const char ** output, size_t len);
//
// Sampling API
//

View File

@ -1,9 +1,9 @@
set(TARGET llama-vdot)
add_executable(${TARGET} vdot.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-q8dot)
add_executable(${TARGET} q8dot.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

12
scripts/build-cpu.sh Executable file
View File

@ -0,0 +1,12 @@
#!/bin/bash
#
# Build one variant of the ggml CPU backend as a shared library.
#
# Usage: scripts/build-cpu.sh <name> [cmake options...]
#
# Configures a throw-away build tree build-cpu-<name>, builds the ggml-cpu
# target and copies the result to ./libggml-cpu-<name>.so.

# Abort on the first failing command. Without this the script's exit status is
# that of the final `rm`, so a failed build goes unnoticed by callers that chain
# several invocations with `&&`.
set -euo pipefail

if [ $# -lt 1 ]; then
    echo "Usage: $0 <name> [cmake options...]" >&2
    exit 1
fi

name="$1"
shift
build_dir="build-cpu-$name"

echo "Building $name with args: $*"

rm -fr "$build_dir"
# "$@" forwards every remaining option as its own word
cmake -S . -B "$build_dir" -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF "$@"
cmake --build "$build_dir" --config Release -t ggml-cpu -j "$(nproc)"
cp "$build_dir/bin/libggml-cpu.so" "./libggml-cpu-$name.so"
rm -fr "$build_dir"

View File

@ -16,15 +16,21 @@ bench_args="${@:3}"
rm -f llama-bench.sqlite > /dev/null
# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
if [ -n "$GGML_CUDA" ]; then
cmake_opts="-DGGML_CUDA=ON"
fi
# Build llama-bench from the currently checked-out tree and append its results
# to llama-bench.sqlite.
# Reads the globals $cmake_opts (extra configure options, may be empty) and
# $bench_args (arguments forwarded to llama-bench).
function run {
    # always start from a clean build tree so the two commits do not share artifacts
    rm -fr build > /dev/null
    cmake -B build -S . $cmake_opts > /dev/null
    # use all cores; a plain `cmake --build` may build serially depending on the generator
    cmake --build build -t llama-bench -j $(nproc) > /dev/null
    build/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
}
git checkout $1 > /dev/null
make clean > /dev/null
make -j$(nproc) $make_opts llama-bench > /dev/null
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
run
git checkout $2 > /dev/null
make clean > /dev/null
make -j$(nproc) $make_opts llama-bench > /dev/null
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
run
./scripts/compare-llama-bench.py -b $1 -c $2

View File

@ -1,212 +0,0 @@
#!/bin/bash
#
# Use this script only on fresh pods (runpod.io)!
# Otherwise, it can break your environment!
#
if [ -z "$1" ]; then
echo "Usage: $0 <data>"
echo " 0: no models"
echo " 1: tinyllama-1b"
echo " 2: codellama-7b"
echo " 3: codellama-13b"
echo " 4: codellama-34b"
echo " 5: codellama-7b-instruct"
echo " 6: codellama-13b-instruct"
echo " 7: codellama-34b-instruct"
exit 1
fi
set -x
# setup deps
apt-get update
apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
git-lfs install
if [ ! -d "/workspace" ]; then
ln -sfn $(pwd) /workspace
fi
# download data
cd /workspace
# this is useful to git clone repos without doubling the disk size due to .git
git clone https://github.com/iboB/git-lfs-download
ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
# llama.cpp
cd /workspace
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
GGML_CUDA=1 make -j
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b
ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b
ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct
ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct
pip install -r requirements.txt
# cmake
cd /workspace/llama.cpp
mkdir build-cublas
cd build-cublas
cmake -DGGML_CUDA=1 ../
make -j
if [ "$1" -eq "0" ]; then
exit 0
fi
# more models
if [ "$1" -eq "1" ]; then
cd /workspace
git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "2" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors*
rm -v ./CodeLlama-7b-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "3" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
rm -v ./CodeLlama-13b-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "4" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
rm -v ./CodeLlama-34b-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "5" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "6" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "7" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 examples/convert_legacy_llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "1" ]; then
# perf + perplexity
cd /workspace/llama.cpp/build-cublas
make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"
../scripts/get-wikitext-2.sh
unzip wikitext-2-raw-v1.zip
make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
# batched
cd /workspace/llama.cpp
GGML_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
# batched-bench
cd /workspace/llama.cpp
GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
# parallel
cd /workspace/llama.cpp
GGML_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi
# speculative
#if [ "$1" -eq "7" ]; then
# cd /workspace/llama.cpp
#
# GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi
# more benches
#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1

View File

@ -1,418 +0,0 @@
#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]
#
# --port: port number, default is 8888
# --repo: path to a repo containing GGUF model files
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
# --backend: cpu, cuda, metal, depends on the OS
# --gpu-id: gpu id, default is 0
# --n-parallel: number of parallel requests, default is 8
# --n-kv: KV cache size, default is 4096
# --verbose: verbose output
# --non-interactive: run without asking a permission to run
#
# Example:
#
# bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
set -e
# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
printf "[-] curl not found\n"
exit 1
fi
if ! command -v git &> /dev/null; then
printf "[-] git not found\n"
exit 1
fi
if ! command -v make &> /dev/null; then
printf "[-] make not found\n"
exit 1
fi
# parse arguments
is_interactive=1
port=8888
repo=""
wtype=""
backend="cpu"
# if macOS, use metal backend by default
if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi
gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
# Print the command-line help: the option summary followed by the one-line
# install example.
function print_usage {
    printf "Usage:\n"
    # the flag is spelled with two dashes; the argument parser rejects "-non-interactive"
    printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]\n\n"
    printf " --port: port number, default is 8888\n"
    printf " --repo: path to a repo containing GGUF model files\n"
    printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
    printf " --backend: cpu, cuda, metal, depends on the OS\n"
    printf " --gpu-id: gpu id, default is 0\n"
    printf " --n-parallel: number of parallel requests, default is 8\n"
    printf " --n-kv: KV cache size, default is 4096\n"
    printf " --verbose: verbose output\n"
    # blank line goes after the last option so the list stays together
    printf " --non-interactive: run without asking a permission to run\n\n"
    printf "Example:\n\n"
    printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--non-interactive)
is_interactive=0
shift
;;
--port)
port="$2"
shift
shift
;;
--repo)
repo="$2"
shift
shift
;;
--wtype)
wtype="$2"
shift
shift
;;
--backend)
backend="$2"
shift
shift
;;
--gpu-id)
gpu_id="$2"
shift
shift
;;
--n-parallel)
n_parallel="$2"
shift
shift
;;
--n-kv)
n_kv="$2"
shift
shift
;;
--verbose)
verbose=1
shift
;;
--help)
print_usage
exit 0
;;
*)
echo "Unknown argument: $key"
print_usage
exit 1
;;
esac
done
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
wfiles=()
for wt in "${wtypes[@]}"; do
wfiles+=("")
done
# map wtype input to index
if [[ ! -z "$wtype" ]]; then
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]')
if [[ "$uwt" == "$wtype" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
exit 1
fi
wtype="$iw"
fi
# sample repos
repos=(
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
"https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
"https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
"https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)
if [ $is_interactive -eq 1 ]; then
printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf " Based on the options that follow, the script might download a model file\n"
printf " from the internet, which can be a few GBs in size. The script will also\n"
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
printf " model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf " Please note:\n"
printf "\n"
printf " - All new data will be stored in the current folder\n"
printf " - The server will be listening on all network interfaces\n"
printf " - The server will run with default settings which are not always optimal\n"
printf " - Do not judge the quality of a model based on the results from this script\n"
printf " - Do not use this script to benchmark llama.cpp\n"
printf " - Do not use this script in production\n"
printf " - This script is only for demonstration purposes\n"
printf "\n"
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf " Press Enter to continue ...\n\n"
read
fi
if [[ -z "$repo" ]]; then
printf "[+] No repo provided from the command line\n"
printf " Please select a number from the list below or enter an URL:\n\n"
is=0
for r in "${repos[@]}"; do
printf " %2d) %s\n" $is "$r"
is=$((is+1))
done
# ask for repo until index of sample repo is provided or an URL
while [[ -z "$repo" ]]; do
printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
read -p "[+] Select repo: " repo
# check if the input is a number
if [[ "$repo" =~ ^[0-9]+$ ]]; then
if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
repo="${repos[$repo]}"
else
printf "[-] Invalid repo index: %s\n" "$repo"
repo=""
fi
elif [[ "$repo" =~ ^https?:// ]]; then
repo="$repo"
else
printf "[-] Invalid repo URL: %s\n" "$repo"
repo=""
fi
done
fi
# remove suffix
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
printf "[+] Checking for GGUF model files in %s\n" "$repo"
# find GGUF files in the source
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
# list all files in the provided git repo
printf "[+] Model files:\n\n"
for file in $model_files; do
# determine iw by grepping the filename with wtypes
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
if [[ "$ufile" =~ "$wt" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
continue
fi
wfiles[$iw]="$file"
have=" "
if [[ -f "$file" ]]; then
have="*"
fi
printf " %2d) %s %s\n" $iw "$have" "$file"
done
wfile="${wfiles[$wtype]}"
# ask for weights type until provided and available
while [[ -z "$wfile" ]]; do
printf "\n"
read -p "[+] Select weight type: " wtype
wfile="${wfiles[$wtype]}"
if [[ -z "$wfile" ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
wtype=""
fi
done
printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
url="${repo%/}/resolve/main/$wfile"
# check file if the model has been downloaded before
chk="$wfile.chk"
# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info
do_download=0
if [[ ! -f "$wfile" ]]; then
do_download=1
elif [[ ! -f "$chk" ]]; then
do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
do_download=1
fi
if [[ $do_download -eq 1 ]]; then
printf "[+] Downloading weights from %s\n" "$url"
# download the weights file
curl -o "$wfile" -# -L "$url"
# create a check file if successful
if [[ $? -eq 0 ]]; then
printf "[+] Creating check file %s\n" "$chk"
touch "$chk"
fi
else
printf "[+] Using cached weights %s\n" "$wfile"
fi
# get latest llama.cpp and build
printf "[+] Downloading latest llama.cpp\n"
llama_cpp_dir="__llama_cpp_port_${port}__"
if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
# if the dir exists and there isn't a file "__ggml_script__" in it, abort
printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
printf "[-] Please remove it and try again\n"
exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
printf "[+] Using cached llama.cpp\n"
cd "$llama_cpp_dir"
git reset --hard
git fetch
git checkout origin/master
cd ..
else
printf "[+] Cloning llama.cpp\n"
git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi
# mark that the directory is made by this script
touch "$llama_cpp_dir/__ggml_script__"
if [[ $verbose -eq 1 ]]; then
set -x
fi
# build
cd "$llama_cpp_dir"
make clean
log="--silent"
if [[ $verbose -eq 1 ]]; then
log=""
fi
if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n"
GGML_CUDA=1 make -j llama-server $log
elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n"
make -j llama-server $log
elif [[ "$backend" == "metal" ]]; then
printf "[+] Building with Metal backend\n"
make -j llama-server $log
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
# run the server
printf "[+] Running server\n"
args=""
if [[ "$backend" == "cuda" ]]; then
export CUDA_VISIBLE_DEVICES=$gpu_id
args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
args="-ngl 999"
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
if [[ $verbose -eq 1 ]]; then
args="$args --verbose"
fi
./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
exit 0

View File

@ -25,7 +25,7 @@ add_library(llama
)
target_include_directories(llama PUBLIC . ../include)
target_compile_features (llama PUBLIC cxx_std_11) # don't bump
target_compile_features (llama PUBLIC cxx_std_17) # don't bump
target_link_libraries(llama PUBLIC ggml)

View File

@ -1552,6 +1552,67 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
},
};
// Identifiers for the chat prompt formats that are built in.
// llama_chat_detect_template() maps a template name or template text to one of
// these values.
enum llm_chat_template {
LLM_CHAT_TEMPLATE_CHATML,
LLM_CHAT_TEMPLATE_LLAMA_2,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
LLM_CHAT_TEMPLATE_MISTRAL_V1,
LLM_CHAT_TEMPLATE_MISTRAL_V3,
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
LLM_CHAT_TEMPLATE_MISTRAL_V7,
LLM_CHAT_TEMPLATE_PHI_3,
LLM_CHAT_TEMPLATE_ZEPHYR,
LLM_CHAT_TEMPLATE_MONARCH,
LLM_CHAT_TEMPLATE_GEMMA,
LLM_CHAT_TEMPLATE_ORION,
LLM_CHAT_TEMPLATE_OPENCHAT,
LLM_CHAT_TEMPLATE_VICUNA,
LLM_CHAT_TEMPLATE_VICUNA_ORCA,
LLM_CHAT_TEMPLATE_DEEPSEEK,
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGML_3,
LLM_CHAT_TEMPLATE_CHATGML_4,
LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3,
LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE,
// returned by llama_chat_detect_template() when no known template matches
LLM_CHAT_TEMPLATE_UNKNOWN,
};
// Short names of the built-in chat templates and the enum value each one selects.
// llama_chat_detect_template() consults this table first, for an exact name
// match, before falling back to substring heuristics.
// LLM_CHAT_TEMPLATE_UNKNOWN intentionally has no name here.
static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "chatml", LLM_CHAT_TEMPLATE_CHATML },
{ "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
{ "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
{ "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
{ "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
{ "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
{ "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA },
{ "orion", LLM_CHAT_TEMPLATE_ORION },
{ "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
{ "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
{ "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
};
static llm_arch llm_arch_from_string(const std::string & name) {
for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
if (kv.second == name) {
@ -21871,18 +21932,109 @@ int32_t llama_detokenize(
// chat templates
//
// Resolve a chat template to one of the built-in llm_chat_template kinds.
//
// `tmpl` is first looked up as an exact name in LLM_CHAT_TEMPLATES. When that
// fails, its text is searched for marker substrings characteristic of each known
// template. The checks run in a fixed order and the first match wins.
// Returns LLM_CHAT_TEMPLATE_UNKNOWN when nothing matches.
static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
    // exact match against the built-in template names
    const auto named = LLM_CHAT_TEMPLATES.find(tmpl);
    if (named != LLM_CHAT_TEMPLATES.end()) {
        return named->second;
    }

    // substring test on the template text
    const auto has = [&tmpl](const char * needle) -> bool {
        return tmpl.find(needle) != std::string::npos;
    };

    if (has("<|im_start|>")) {
        return LLM_CHAT_TEMPLATE_CHATML;
    }

    // mistral / llama2 family: name starting with "mistral", or any template using [INST]
    if (tmpl.find("mistral") == 0 || has("[INST]")) {
        if (has("[SYSTEM_PROMPT]")) {
            return LLM_CHAT_TEMPLATE_MISTRAL_V7;
        }

        // Official mistral 'v1', 'v3' and 'v3-tekken' templates
        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
        // The first marker catches the official 'v1' template, the second one 'v3' and 'v3-tekken'.
        if (has("' [INST] ' + system_message") || has("[AVAILABLE_TOOLS]")) {
            if (has(" [INST]")) {
                return LLM_CHAT_TEMPLATE_MISTRAL_V1;
            }
            if (has("\"[INST]\"")) {
                return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
            }
            return LLM_CHAT_TEMPLATE_MISTRAL_V3;
        }

        // llama2 template and its variants, most specific marker first
        // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
        if (has("content.strip()")) {
            return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
        }
        if (has("bos_token + '[INST]")) {
            return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
        }
        if (has("<<SYS>>")) {
            return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
        }
        return LLM_CHAT_TEMPLATE_LLAMA_2;
    }

    if (has("<|assistant|>") && has("<|end|>")) {
        return LLM_CHAT_TEMPLATE_PHI_3;
    }
    if (has("<|user|>") && has("<|endoftext|>")) {
        return LLM_CHAT_TEMPLATE_ZEPHYR;
    }
    if (has("bos_token + message['role']")) {
        return LLM_CHAT_TEMPLATE_MONARCH;
    }
    if (has("<start_of_turn>")) {
        return LLM_CHAT_TEMPLATE_GEMMA;
    }
    if (has("'\\n\\nAssistant: ' + eos_token")) {
        // OrionStarAI/Orion-14B-Chat
        return LLM_CHAT_TEMPLATE_ORION;
    }
    if (has("GPT4 Correct ")) {
        // openchat/openchat-3.5-0106
        return LLM_CHAT_TEMPLATE_OPENCHAT;
    }
    if (has("USER: ") && has("ASSISTANT: ")) {
        // eachadea/vicuna-13b-1.1 (and Orca variant)
        return has("SYSTEM: ") ? LLM_CHAT_TEMPLATE_VICUNA_ORCA : LLM_CHAT_TEMPLATE_VICUNA;
    }
    if (has("### Instruction:") && has("<|EOT|>")) {
        // deepseek-ai/deepseek-coder-33b-instruct
        return LLM_CHAT_TEMPLATE_DEEPSEEK;
    }
    if (has("<|START_OF_TURN_TOKEN|>") && has("<|USER_TOKEN|>")) {
        // CohereForAI/c4ai-command-r-plus
        return LLM_CHAT_TEMPLATE_COMMAND_R;
    }
    if (has("<|start_header_id|>") && has("<|end_header_id|>")) {
        return LLM_CHAT_TEMPLATE_LLAMA_3;
    }
    if (has("[gMASK]sop")) {
        // chatglm3-6b
        return LLM_CHAT_TEMPLATE_CHATGML_3;
    }
    if (has("[gMASK]<sop>")) {
        return LLM_CHAT_TEMPLATE_CHATGML_4;
    }
    if (has(LU8("<用户>"))) {
        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
        return LLM_CHAT_TEMPLATE_MINICPM;
    }
    if (has("'Assistant: ' + message['content'] + eos_token")) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
    }
    if (has("[|system|]") && has("[|assistant|]") && has("[|endofturn|]")) {
        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
        // EXAONE-3.0-7.8B-Instruct
        return LLM_CHAT_TEMPLATE_EXAONE_3;
    }
    if (has("rwkv-world")) {
        return LLM_CHAT_TEMPLATE_RWKV_WORLD;
    }
    if (has("<|start_of_role|>")) {
        return LLM_CHAT_TEMPLATE_GRANITE;
    }

    return LLM_CHAT_TEMPLATE_UNKNOWN;
}
// Simple version of "llama_chat_apply_template" that only works with strings
// This function uses heuristic checks to detect commonly used templates. It is not a jinja parser.
static int32_t llama_chat_apply_template_internal(
const std::string & tmpl,
const llm_chat_template tmpl,
const std::vector<const llama_chat_message *> & chat,
std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss;
auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
return tmpl.find(haystack) != std::string::npos;
};
if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
// chatml template
for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@ -21890,16 +22042,59 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|im_start|>assistant\n";
}
} else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
// Official mistral 'v7' template
// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
for (auto message : chat) {
std::string role(message->role);
std::string content(message->content);
if (role == "system") {
ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
} else if (role == "user") {
ss << "[INST] " << content << "[/INST]";
}
else {
ss << " " << content << "</s>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
bool is_inside_turn = false;
for (auto message : chat) {
if (!is_inside_turn) {
ss << leading_space << "[INST]" << trailing_space;
is_inside_turn = true;
}
std::string role(message->role);
std::string content(message->content);
if (role == "system") {
ss << content << "\n\n";
} else if (role == "user") {
ss << content << leading_space << "[/INST]";
} else {
ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
is_inside_turn = false;
}
}
} else if (
tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
// llama2 template and its variants
// [variant] support system message
bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
// [variant] space before + after response
bool space_around_response = tmpl_contains("' ' + eos_token");
// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
// [variant] add BOS inside history
bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
// [variant] trim spaces from the input message
bool strip_message = tmpl_contains("content.strip()");
bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
// construct the prompt
bool is_inside_turn = true; // skip BOS at the beginning
ss << "[INST] ";
@ -21920,12 +22115,11 @@ static int32_t llama_chat_apply_template_internal(
} else if (role == "user") {
ss << content << " [/INST]";
} else {
ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
ss << content << "</s>";
is_inside_turn = false;
}
}
// llama2 templates seem to not care about "add_generation_prompt"
} else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
} else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
// Phi 3
for (auto message : chat) {
std::string role(message->role);
@ -21934,7 +22128,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
// zephyr template
for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@ -21942,7 +22136,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
for (auto message : chat) {
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@ -21951,7 +22145,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
} else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
@ -21973,7 +22167,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<start_of_turn>model\n";
}
} else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
// OrionStarAI/Orion-14B-Chat
std::string system_prompt = "";
for (auto message : chat) {
@ -21993,7 +22187,7 @@ static int32_t llama_chat_apply_template_internal(
ss << message->content << "</s>";
}
}
} else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
// openchat/openchat-3.5-0106,
for (auto message : chat) {
std::string role(message->role);
@ -22007,13 +22201,13 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "GPT4 Correct Assistant:";
}
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
} else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
// eachadea/vicuna-13b-1.1 (and Orca variant)
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
// Orca-Vicuna variant uses a system prefix
if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
ss << "SYSTEM: " << message->content << "\n";
} else {
ss << message->content << "\n\n";
@ -22027,7 +22221,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "ASSISTANT:";
}
} else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
// deepseek-ai/deepseek-coder-33b-instruct
for (auto message : chat) {
std::string role(message->role);
@ -22042,7 +22236,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "### Response:\n";
}
} else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
} else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
// CohereForAI/c4ai-command-r-plus
for (auto message : chat) {
std::string role(message->role);
@ -22057,7 +22251,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
}
} else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
} else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
// Llama 3
for (auto message : chat) {
std::string role(message->role);
@ -22066,7 +22260,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
}
} else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
// chatglm3-6b
ss << "[gMASK]" << "sop";
for (auto message : chat) {
@ -22076,7 +22270,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
ss << "[gMASK]" << "<sop>";
for (auto message : chat) {
std::string role(message->role);
@ -22094,6 +22288,7 @@ static int32_t llama_chat_apply_template_internal(
ss << "<|assistant|>";
}
} else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) {
std::string role(message->role);
@ -22105,7 +22300,7 @@ static int32_t llama_chat_apply_template_internal(
ss << trim(message->content);
}
}
} else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
// DeepSeek-V2
for (auto message : chat) {
std::string role(message->role);
@ -22120,7 +22315,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "Assistant:";
}
} else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
// EXAONE-3.0-7.8B-Instruct
for (auto message : chat) {
@ -22136,7 +22331,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "[|assistant|]";
}
} else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
// this template requires the model to have "\n\n" as EOT token
for (auto message : chat) {
std::string role(message->role);
@ -22146,7 +22341,7 @@ static int32_t llama_chat_apply_template_internal(
ss << message->content << "\n\n";
}
}
} else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) {
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
// IBM Granite template
for (const auto & message : chat) {
std::string role(message->role);
@ -22198,7 +22393,11 @@ int32_t llama_chat_apply_template(
}
std::string formatted_chat;
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
return -1;
}
int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
if (res < 0) {
return res;
}
@ -22208,6 +22407,15 @@ int32_t llama_chat_apply_template(
return res;
}
// Writes C-string pointers to the keys of LLM_CHAT_TEMPLATES into `output`.
//
// output: caller-provided array that receives up to `len` pointers. Each pointer
//         comes from c_str() on a key stored in LLM_CHAT_TEMPLATES, so the strings
//         are owned by that container and must not be freed by the caller.
// len:    capacity of `output`; no more than `len` entries are written.
//
// Returns the total number of entries in LLM_CHAT_TEMPLATES, independent of how
// many were actually written, so a caller can compare it with `len` to detect
// truncation.
int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
    const size_t n_total = LLM_CHAT_TEMPLATES.size();
    // Never write past the caller's buffer nor read past the end of the table.
    const size_t n_write = len < n_total ? len : n_total;

    size_t idx = 0;
    for (auto entry = LLM_CHAT_TEMPLATES.begin(); idx < n_write; ++entry, ++idx) {
        output[idx] = entry->first.c_str();
    }

    return static_cast<int32_t>(n_total);
}
//
// sampling
//

Some files were not shown because too many files have changed in this diff Show More