mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-11 13:30:35 +00:00
llama : reorganize source code + improve CMake (#8006)
* scripts : update sync [no ci]
* files : relocate [no ci]
* ci : disable kompute build [no ci]
* cmake : fixes [no ci]
* server : fix mingw build ggml-ci
* cmake : minor [no ci]
* cmake : link math library [no ci]
* cmake : build normal ggml library (not object library) [no ci]
* cmake : fix kompute build ggml-ci
* make,cmake : fix LLAMA_CUDA + replace GGML_CDEF_PRIVATE ggml-ci
* move public backend headers to the public include directory (#8122)
* move public backend headers to the public include directory
* nix test
* spm : fix metal header
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* scripts : fix sync paths [no ci]
* scripts : sync ggml-blas.h [no ci]
---------
Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
parent
8854044561
commit
f3f65429c4
@ -160,9 +160,9 @@ effectiveStdenv.mkDerivation (
|
|||||||
};
|
};
|
||||||
|
|
||||||
postPatch = ''
|
postPatch = ''
|
||||||
substituteInPlace ./ggml-metal.m \
|
substituteInPlace ./ggml/src/ggml-metal.m \
|
||||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||||
substituteInPlace ./ggml-metal.m \
|
substituteInPlace ./ggml/src/ggml-metal.m \
|
||||||
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
||||||
'';
|
'';
|
||||||
|
|
||||||
@ -205,17 +205,17 @@ effectiveStdenv.mkDerivation (
|
|||||||
|
|
||||||
cmakeFlags =
|
cmakeFlags =
|
||||||
[
|
[
|
||||||
(cmakeBool "LLAMA_NATIVE" false)
|
|
||||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||||
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
||||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||||
(cmakeBool "LLAMA_BLAS" useBlas)
|
(cmakeBool "GGML_NATIVE" false)
|
||||||
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
|
(cmakeBool "GGML_BLAS" useBlas)
|
||||||
(cmakeBool "LLAMA_CUDA" useCuda)
|
(cmakeBool "GGML_CLBLAST" useOpenCL)
|
||||||
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
(cmakeBool "GGML_CUDA" useCuda)
|
||||||
(cmakeBool "LLAMA_METAL" useMetalKit)
|
(cmakeBool "GGML_HIPBLAS" useRocm)
|
||||||
(cmakeBool "LLAMA_VULKAN" useVulkan)
|
(cmakeBool "GGML_METAL" useMetalKit)
|
||||||
(cmakeBool "LLAMA_STATIC" enableStatic)
|
(cmakeBool "GGML_VULKAN" useVulkan)
|
||||||
|
(cmakeBool "GGML_STATIC" enableStatic)
|
||||||
]
|
]
|
||||||
++ optionals useCuda [
|
++ optionals useCuda [
|
||||||
(
|
(
|
||||||
@ -231,7 +231,7 @@ effectiveStdenv.mkDerivation (
|
|||||||
]
|
]
|
||||||
++ optionals useMetalKit [
|
++ optionals useMetalKit [
|
||||||
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
|
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
|
||||||
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
(cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
||||||
];
|
];
|
||||||
|
|
||||||
# Environment variables needed for ROCm
|
# Environment variables needed for ROCm
|
||||||
@ -244,7 +244,7 @@ effectiveStdenv.mkDerivation (
|
|||||||
# if they haven't been added yet.
|
# if they haven't been added yet.
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
mkdir -p $out/include
|
mkdir -p $out/include
|
||||||
cp $src/llama.h $out/include/
|
cp $src/include/llama.h $out/include/
|
||||||
'';
|
'';
|
||||||
|
|
||||||
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
||||||
|
28
.github/labeler.yml
vendored
28
.github/labeler.yml
vendored
@ -2,31 +2,31 @@
|
|||||||
Kompute:
|
Kompute:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml-kompute.h
|
- ggml/include/ggml-kompute.h
|
||||||
- ggml-kompute.cpp
|
- ggml/src/ggml-kompute.cpp
|
||||||
- README-kompute.md
|
- README-kompute.md
|
||||||
Apple Metal:
|
Apple Metal:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml-metal.h
|
- ggml/include/ggml-metal.h
|
||||||
- ggml-metal.cpp
|
- ggml/src/ggml-metal.cpp
|
||||||
- README-metal.md
|
- README-metal.md
|
||||||
SYCL:
|
SYCL:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml-sycl.h
|
- ggml/include/ggml-sycl.h
|
||||||
- ggml-sycl.cpp
|
- ggml/src/ggml-sycl.cpp
|
||||||
- README-sycl.md
|
- README-sycl.md
|
||||||
Nvidia GPU:
|
Nvidia GPU:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml-cuda.h
|
- ggml/include/ggml-cuda.h
|
||||||
- ggml-cuda/**
|
- ggml/src/ggml-cuda/**
|
||||||
Vulkan:
|
Vulkan:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml_vk_generate_shaders.py
|
- ggml/ggml_vk_generate_shaders.py
|
||||||
- ggml-vulkan*
|
- ggml/src/ggml-vulkan*
|
||||||
documentation:
|
documentation:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
@ -73,10 +73,10 @@ server:
|
|||||||
ggml:
|
ggml:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml.c
|
- ggml/include/ggml*.h
|
||||||
- ggml.h
|
- ggml/src/ggml*.c
|
||||||
- ggml-*.c
|
- ggml/src/ggml*.cpp
|
||||||
- ggml-*.h
|
- ggml/src/ggml*.h
|
||||||
- ggml-cuda/**
|
- ggml-cuda/**
|
||||||
nix:
|
nix:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
|
2
.github/workflows/bench.yml
vendored
2
.github/workflows/bench.yml
vendored
@ -109,7 +109,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
set -eux
|
set -eux
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DLLAMA_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DLLAMA_CURL=ON \
|
-DLLAMA_CURL=ON \
|
||||||
-DLLAMA_CUBLAS=ON \
|
-DLLAMA_CUBLAS=ON \
|
||||||
|
74
.github/workflows/build.yml
vendored
74
.github/workflows/build.yml
vendored
@ -47,7 +47,7 @@ jobs:
|
|||||||
sysctl -a
|
sysctl -a
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
|
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
@ -105,7 +105,7 @@ jobs:
|
|||||||
sysctl -a
|
sysctl -a
|
||||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||||
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
|
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
|
||||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
@ -305,7 +305,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
|
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
|
||||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
@ -335,7 +335,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_RPC=ON ..
|
cmake -DGGML_RPC=ON ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
@ -363,7 +363,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_VULKAN=ON ..
|
cmake -DGGML_VULKAN=ON ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
ubuntu-22-cmake-hip:
|
ubuntu-22-cmake-hip:
|
||||||
@ -384,13 +384,13 @@ jobs:
|
|||||||
- name: Build with native CMake HIP support
|
- name: Build with native CMake HIP support
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
|
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
- name: Build with legacy HIP support
|
- name: Build with legacy HIP support
|
||||||
id: cmake_build_legacy_hip
|
id: cmake_build_legacy_hip
|
||||||
run: |
|
run: |
|
||||||
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
|
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
|
||||||
cmake --build build2 --config Release -j $(nproc)
|
cmake --build build2 --config Release -j $(nproc)
|
||||||
|
|
||||||
ubuntu-22-cmake-sycl:
|
ubuntu-22-cmake-sycl:
|
||||||
@ -431,7 +431,7 @@ jobs:
|
|||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
ubuntu-22-cmake-sycl-fp16:
|
ubuntu-22-cmake-sycl-fp16:
|
||||||
@ -472,10 +472,10 @@ jobs:
|
|||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
|
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
# how to debug it.
|
# how to debug it.
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
||||||
macOS-latest-make:
|
macOS-latest-make:
|
||||||
@ -497,15 +497,15 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
LLAMA_FATAL_WARNINGS: 1
|
LLAMA_FATAL_WARNINGS: 1
|
||||||
run: |
|
run: |
|
||||||
LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
|
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
id: make_test
|
id: make_test
|
||||||
run: |
|
run: |
|
||||||
LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
|
GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
|
||||||
LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
|
GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
# TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
# how to debug it.
|
# how to debug it.
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
|
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
|
||||||
# would be great if we fix these
|
# would be great if we fix these
|
||||||
@ -529,7 +529,7 @@ jobs:
|
|||||||
sysctl -a
|
sysctl -a
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
|
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
@ -559,13 +559,14 @@ jobs:
|
|||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -G Xcode .. \
|
cmake -G Xcode .. \
|
||||||
-DLLAMA_METAL_EMBED_LIBRARY=ON \
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
-DLLAMA_BUILD_TESTS=OFF \
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=OFF \
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
-DCMAKE_SYSTEM_NAME=iOS \
|
-DCMAKE_SYSTEM_NAME=iOS \
|
||||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||||
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||||
|
|
||||||
macOS-latest-cmake-tvos:
|
macOS-latest-cmake-tvos:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
@ -588,13 +589,14 @@ jobs:
|
|||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -G Xcode .. \
|
cmake -G Xcode .. \
|
||||||
-DLLAMA_METAL_EMBED_LIBRARY=ON \
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
-DLLAMA_BUILD_TESTS=OFF \
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=OFF \
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||||
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||||
|
|
||||||
macOS-latest-swift:
|
macOS-latest-swift:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
@ -662,7 +664,7 @@ jobs:
|
|||||||
- name: Build using make w/ OpenBLAS
|
- name: Build using make w/ OpenBLAS
|
||||||
shell: msys2 {0}
|
shell: msys2 {0}
|
||||||
run: |
|
run: |
|
||||||
make LLAMA_OPENBLAS=1 -j $(nproc)
|
make GGML_OPENBLAS=1 -j $(nproc)
|
||||||
|
|
||||||
- name: Build using CMake
|
- name: Build using CMake
|
||||||
shell: msys2 {0}
|
shell: msys2 {0}
|
||||||
@ -678,7 +680,7 @@ jobs:
|
|||||||
- name: Build using CMake w/ OpenBLAS
|
- name: Build using CMake w/ OpenBLAS
|
||||||
shell: msys2 {0}
|
shell: msys2 {0}
|
||||||
run: |
|
run: |
|
||||||
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
|
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||||
|
|
||||||
windows-latest-cmake:
|
windows-latest-cmake:
|
||||||
@ -693,25 +695,25 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- build: 'rpc-x64'
|
- build: 'rpc-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'noavx-x64'
|
- build: 'noavx-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'avx2-x64'
|
- build: 'avx2-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'avx-x64'
|
- build: 'avx-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'avx512-x64'
|
- build: 'avx512-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'openblas-x64'
|
- build: 'openblas-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||||
- build: 'kompute-x64'
|
- build: 'kompute-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'vulkan-x64'
|
- build: 'vulkan-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'llvm-arm64'
|
- build: 'llvm-arm64'
|
||||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'msvc-arm64'
|
- build: 'msvc-arm64'
|
||||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
@ -724,7 +726,7 @@ jobs:
|
|||||||
id: clone_kompute
|
id: clone_kompute
|
||||||
if: ${{ matrix.build == 'kompute-x64' }}
|
if: ${{ matrix.build == 'kompute-x64' }}
|
||||||
run: |
|
run: |
|
||||||
git submodule update --init kompute
|
git submodule update --init ggml/src/kompute
|
||||||
|
|
||||||
- name: Download OpenBLAS
|
- name: Download OpenBLAS
|
||||||
id: get_openblas
|
id: get_openblas
|
||||||
@ -854,7 +856,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
|
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
|
||||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
|
|
||||||
- name: Determine tag name
|
- name: Determine tag name
|
||||||
@ -987,7 +989,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||||
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
|
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
|
|
||||||
ios-xcode-build:
|
ios-xcode-build:
|
||||||
|
6
.github/workflows/server.yml
vendored
6
.github/workflows/server.yml
vendored
@ -92,12 +92,12 @@ jobs:
|
|||||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DLLAMA_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DLLAMA_CURL=ON \
|
-DLLAMA_CURL=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
-DLLAMA_OPENMP=OFF ;
|
-DGGML_OPENMP=OFF ;
|
||||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
@ -105,7 +105,7 @@ jobs:
|
|||||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DLLAMA_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DLLAMA_CURL=ON \
|
-DLLAMA_CURL=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -56,6 +56,7 @@ CMakeSettings.json
|
|||||||
compile_commands.json
|
compile_commands.json
|
||||||
ggml-metal-embed.metal
|
ggml-metal-embed.metal
|
||||||
llama-batched-swift
|
llama-batched-swift
|
||||||
|
/rpc-server
|
||||||
out/
|
out/
|
||||||
tmp/
|
tmp/
|
||||||
|
|
||||||
|
2
.gitmodules
vendored
2
.gitmodules
vendored
@ -1,3 +1,3 @@
|
|||||||
[submodule "kompute"]
|
[submodule "kompute"]
|
||||||
path = kompute
|
path = ggml/src/kompute
|
||||||
url = https://github.com/nomic-ai/kompute.git
|
url = https://github.com/nomic-ai/kompute.git
|
||||||
|
1352
CMakeLists.txt
1352
CMakeLists.txt
File diff suppressed because it is too large
Load Diff
@ -19,14 +19,14 @@
|
|||||||
"cacheVariables": {
|
"cacheVariables": {
|
||||||
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||||
"CMAKE_CXX_COMPILER": "icx",
|
"CMAKE_CXX_COMPILER": "icx",
|
||||||
"LLAMA_SYCL": "ON",
|
"GGML_SYCL": "ON",
|
||||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||||
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||||
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
|
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "arm64-windows-msvc", "hidden": true,
|
"name": "arm64-windows-msvc", "hidden": true,
|
||||||
|
@ -3,14 +3,13 @@
|
|||||||
import PackageDescription
|
import PackageDescription
|
||||||
|
|
||||||
var sources = [
|
var sources = [
|
||||||
"ggml.c",
|
"src/llama.cpp",
|
||||||
"sgemm.cpp",
|
"src/unicode.cpp",
|
||||||
"llama.cpp",
|
"src/unicode-data.cpp",
|
||||||
"unicode.cpp",
|
"ggml/src/ggml.c",
|
||||||
"unicode-data.cpp",
|
"ggml/src/ggml-alloc.c",
|
||||||
"ggml-alloc.c",
|
"ggml/src/ggml-backend.c",
|
||||||
"ggml-backend.c",
|
"ggml/src/ggml-quants.c",
|
||||||
"ggml-quants.c",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
var resources: [Resource] = []
|
var resources: [Resource] = []
|
||||||
@ -26,8 +25,8 @@ var cSettings: [CSetting] = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
#if canImport(Darwin)
|
#if canImport(Darwin)
|
||||||
sources.append("ggml-metal.m")
|
sources.append("ggml/src/ggml-metal.m")
|
||||||
resources.append(.process("ggml-metal.metal"))
|
resources.append(.process("ggml/src/ggml-metal.metal"))
|
||||||
linkerSettings.append(.linkedFramework("Accelerate"))
|
linkerSettings.append(.linkedFramework("Accelerate"))
|
||||||
cSettings.append(
|
cSettings.append(
|
||||||
contentsOf: [
|
contentsOf: [
|
||||||
@ -63,8 +62,6 @@ let package = Package(
|
|||||||
"models",
|
"models",
|
||||||
"tests",
|
"tests",
|
||||||
"CMakeLists.txt",
|
"CMakeLists.txt",
|
||||||
"ggml-cuda.cu",
|
|
||||||
"ggml-cuda.h",
|
|
||||||
"Makefile"
|
"Makefile"
|
||||||
],
|
],
|
||||||
sources: sources,
|
sources: sources,
|
||||||
|
@ -115,12 +115,12 @@ The docker build option is currently limited to *intel GPU* targets.
|
|||||||
### Build image
|
### Build image
|
||||||
```sh
|
```sh
|
||||||
# Using FP16
|
# Using FP16
|
||||||
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
||||||
```
|
```
|
||||||
|
|
||||||
*Notes*:
|
*Notes*:
|
||||||
|
|
||||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
|
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
|
||||||
|
|
||||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||||
|
|
||||||
@ -244,10 +244,10 @@ source /opt/intel/oneapi/setvars.sh
|
|||||||
# Build LLAMA with MKL BLAS acceleration for intel GPU
|
# Build LLAMA with MKL BLAS acceleration for intel GPU
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
# Option 2: Use FP16
|
# Option 2: Use FP16
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
# build all binary
|
# build all binary
|
||||||
cmake --build build --config Release -j -v
|
cmake --build build --config Release -j -v
|
||||||
@ -264,10 +264,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
|||||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
# Option 2: Use FP16
|
# Option 2: Use FP16
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
# build all binary
|
# build all binary
|
||||||
cmake --build build --config Release -j -v
|
cmake --build build --config Release -j -v
|
||||||
@ -422,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
|
|||||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||||
|
|
||||||
# Option 2: Or FP16
|
# Option 2: Or FP16
|
||||||
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
cmake --build build --config Release -j
|
cmake --build build --config Release -j
|
||||||
```
|
```
|
||||||
@ -440,7 +440,7 @@ Or, use CMake presets to build:
|
|||||||
cmake --preset x64-windows-sycl-release
|
cmake --preset x64-windows-sycl-release
|
||||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||||
|
|
||||||
cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
|
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||||
|
|
||||||
cmake --preset x64-windows-sycl-debug
|
cmake --preset x64-windows-sycl-debug
|
||||||
@ -544,9 +544,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||||||
|
|
||||||
| Name | Value | Function |
|
| Name | Value | Function |
|
||||||
|--------------------|-----------------------------------|---------------------------------------------|
|
|--------------------|-----------------------------------|---------------------------------------------|
|
||||||
| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
|
||||||
| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
|
||||||
| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
||||||
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
|
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
|
||||||
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||||
|
|
||||||
|
58
README.md
58
README.md
@ -415,7 +415,7 @@ Flox follows the nixpkgs build of llama.cpp.
|
|||||||
### Metal Build
|
### Metal Build
|
||||||
|
|
||||||
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
|
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
|
||||||
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
|
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
|
||||||
|
|
||||||
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
|
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
|
||||||
argument.
|
argument.
|
||||||
@ -435,7 +435,7 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
- Using `make`:
|
- Using `make`:
|
||||||
- On Linux:
|
- On Linux:
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_OPENBLAS=1
|
make GGML_OPENBLAS=1
|
||||||
```
|
```
|
||||||
|
|
||||||
- On Windows:
|
- On Windows:
|
||||||
@ -450,13 +450,13 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
8. From here you can run:
|
8. From here you can run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_OPENBLAS=1
|
make GGML_OPENBLAS=1
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using `CMake` on Linux:
|
- Using `CMake` on Linux:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
|
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -475,10 +475,10 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
|
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
|
||||||
|
|
||||||
- Using manual oneAPI installation:
|
- Using manual oneAPI installation:
|
||||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
|
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DGGML_BLAS=ON` to cmake, the MKL version of BLAS will automatically be selected. Otherwise, please install oneAPI and follow the steps below:
|
||||||
```bash
|
```bash
|
||||||
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
|
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
|
||||||
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -495,28 +495,28 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_CUDA=1
|
make GGML_CUDA=1
|
||||||
```
|
```
|
||||||
- Using `CMake`:
|
- Using `CMake`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cmake -B build -DLLAMA_CUDA=ON
|
cmake -B build -DGGML_CUDA=ON
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
|
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
|
||||||
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
||||||
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
||||||
| LLAMA_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
||||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||||
| LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
||||||
|
|
||||||
- #### hipBLAS
|
- #### hipBLAS
|
||||||
|
|
||||||
@ -526,15 +526,15 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_HIPBLAS=1
|
make GGML_HIPBLAS=1
|
||||||
```
|
```
|
||||||
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
||||||
```bash
|
```bash
|
||||||
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||||
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||||
&& cmake --build build --config Release -- -j 16
|
&& cmake --build build --config Release -- -j 16
|
||||||
```
|
```
|
||||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
|
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
|
||||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||||
|
|
||||||
Note that if you get the following error:
|
Note that if you get the following error:
|
||||||
@ -548,19 +548,19 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
```bash
|
```bash
|
||||||
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
|
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
|
||||||
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
|
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
|
||||||
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||||
&& cmake --build build -- -j 16
|
&& cmake --build build -- -j 16
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||||
```bash
|
```bash
|
||||||
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
||||||
```bash
|
```bash
|
||||||
set PATH=%HIP_PATH%\bin;%PATH%
|
set PATH=%HIP_PATH%\bin;%PATH%
|
||||||
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
|
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
|
||||||
cmake --build build
|
cmake --build build
|
||||||
```
|
```
|
||||||
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
||||||
@ -572,10 +572,10 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
|
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
|
||||||
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
|
|
||||||
- #### Vulkan
|
- #### Vulkan
|
||||||
|
|
||||||
@ -613,7 +613,7 @@ Building the program with BLAS support may lead to some performance improvements
|
|||||||
Then, build llama.cpp using the cmake command below:
|
Then, build llama.cpp using the cmake command below:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cmake -B build -DLLAMA_VULKAN=1
|
cmake -B build -DGGML_VULKAN=1
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||||
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||||
|
10
ci/run.sh
10
ci/run.sh
@ -36,11 +36,11 @@ SRC=`pwd`
|
|||||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||||
fi
|
fi
|
||||||
## helpers
|
## helpers
|
||||||
|
|
||||||
@ -284,7 +284,7 @@ function gg_run_open_llama_7b_v2 {
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
@ -550,7 +550,7 @@ function gg_run_pythia_2_8b {
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
22
cmake/git-vars.cmake
Normal file
22
cmake/git-vars.cmake
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
find_package(Git)
|
||||||
|
|
||||||
|
# the commit's SHA1
|
||||||
|
execute_process(COMMAND
|
||||||
|
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
|
||||||
|
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||||
|
OUTPUT_VARIABLE GIT_SHA1
|
||||||
|
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
|
||||||
|
# the date of the commit
|
||||||
|
execute_process(COMMAND
|
||||||
|
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
|
||||||
|
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||||
|
OUTPUT_VARIABLE GIT_DATE
|
||||||
|
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
|
||||||
|
# the subject of the commit
|
||||||
|
execute_process(COMMAND
|
||||||
|
"${GIT_EXECUTABLE}" log -1 --format=%s
|
||||||
|
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||||
|
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
|
||||||
|
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
@ -2,11 +2,12 @@ set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
|
|||||||
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
||||||
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
||||||
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||||
set(LLAMA_BLAS @LLAMA_BLAS@)
|
|
||||||
set(LLAMA_CUDA @LLAMA_CUDA@)
|
set(GGML_BLAS @GGML_BLAS@)
|
||||||
set(LLAMA_METAL @LLAMA_METAL@)
|
set(GGML_CUDA @GGML_CUDA@)
|
||||||
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
|
set(GGML_METAL @GGML_METAL@)
|
||||||
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
|
set(GGML_HIPBLAS @GGML_HIPBLAS@)
|
||||||
|
set(GGML_ACCELERATE @GGML_ACCELERATE@)
|
||||||
|
|
||||||
@PACKAGE_INIT@
|
@PACKAGE_INIT@
|
||||||
|
|
||||||
@ -17,25 +18,26 @@ set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
|||||||
# Ensure transient dependencies satisfied
|
# Ensure transient dependencies satisfied
|
||||||
|
|
||||||
find_package(Threads REQUIRED)
|
find_package(Threads REQUIRED)
|
||||||
if (APPLE AND LLAMA_ACCELERATE)
|
|
||||||
|
if (APPLE AND GGML_ACCELERATE)
|
||||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_BLAS)
|
if (GGML_BLAS)
|
||||||
find_package(BLAS REQUIRED)
|
find_package(BLAS REQUIRED)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_CUDA)
|
if (GGML_CUDA)
|
||||||
find_package(CUDAToolkit REQUIRED)
|
find_package(CUDAToolkit REQUIRED)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_METAL)
|
if (GGML_METAL)
|
||||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_HIPBLAS)
|
if (GGML_HIPBLAS)
|
||||||
find_package(hip REQUIRED)
|
find_package(hip REQUIRED)
|
||||||
find_package(hipblas REQUIRED)
|
find_package(hipblas REQUIRED)
|
||||||
find_package(rocblas REQUIRED)
|
find_package(rocblas REQUIRED)
|
||||||
@ -47,7 +49,9 @@ find_library(llama_LIBRARY llama
|
|||||||
|
|
||||||
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
|
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
|
||||||
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
|
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
|
||||||
|
|
||||||
add_library(llama UNKNOWN IMPORTED)
|
add_library(llama UNKNOWN IMPORTED)
|
||||||
|
|
||||||
set_target_properties(llama
|
set_target_properties(llama
|
||||||
PROPERTIES
|
PROPERTIES
|
||||||
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
@ -1,5 +1,6 @@
|
|||||||
# common
|
# common
|
||||||
|
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
# Build info header
|
# Build info header
|
||||||
#
|
#
|
||||||
@ -36,7 +37,7 @@ add_custom_command(
|
|||||||
COMMENT "Generating build details from Git"
|
COMMENT "Generating build details from Git"
|
||||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
|
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
||||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||||
VERBATIM
|
VERBATIM
|
||||||
@ -83,5 +84,5 @@ if (LLAMA_CURL)
|
|||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
target_include_directories(${TARGET} PUBLIC .)
|
target_include_directories(${TARGET} PUBLIC .)
|
||||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
target_compile_features (${TARGET} PUBLIC cxx_std_11)
|
||||||
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||||
|
|
||||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used.
|
|||||||
Makefile:
|
Makefile:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_BLIS=1 -j
|
make GGML_BLIS=1 -j
|
||||||
# make LLAMA_BLIS=1 benchmark-matmult
|
# make GGML_BLIS=1 llama-benchmark-matmult
|
||||||
```
|
```
|
||||||
|
|
||||||
CMake:
|
CMake:
|
||||||
@ -39,7 +39,7 @@ CMake:
|
|||||||
```bash
|
```bash
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
|
cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
|
||||||
make -j
|
make -j
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -39,13 +39,13 @@ else()
|
|||||||
add_subdirectory(quantize-stats)
|
add_subdirectory(quantize-stats)
|
||||||
add_subdirectory(quantize)
|
add_subdirectory(quantize)
|
||||||
add_subdirectory(retrieval)
|
add_subdirectory(retrieval)
|
||||||
if (LLAMA_RPC)
|
if (GGML_RPC)
|
||||||
add_subdirectory(rpc)
|
add_subdirectory(rpc)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_BUILD_SERVER)
|
if (LLAMA_BUILD_SERVER)
|
||||||
add_subdirectory(server)
|
add_subdirectory(server)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_SYCL)
|
if (GGML_SYCL)
|
||||||
add_subdirectory(sycl)
|
add_subdirectory(sycl)
|
||||||
endif()
|
endif()
|
||||||
add_subdirectory(save-load-state)
|
add_subdirectory(save-load-state)
|
||||||
|
@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
|||||||
## Example
|
## Example
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
LLAMA_CUDA=1 make -j
|
GGML_CUDA=1 make -j
|
||||||
|
|
||||||
# generate importance matrix (imatrix.dat)
|
# generate importance matrix (imatrix.dat)
|
||||||
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||||
|
@ -194,7 +194,7 @@ llama_print_timings: total time = 44411.01 ms / 377 tokens
|
|||||||
## Orin compile and run
|
## Orin compile and run
|
||||||
### compile
|
### compile
|
||||||
```sh
|
```sh
|
||||||
make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
|
||||||
```
|
```
|
||||||
### run on Orin
|
### run on Orin
|
||||||
### case 1
|
### case 1
|
||||||
|
@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
|
On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
|
||||||
For example, to build the CUDA backend with RPC support:
|
For example, to build the CUDA backend with RPC support:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir build-rpc-cuda
|
mkdir build-rpc-cuda
|
||||||
cd build-rpc-cuda
|
cd build-rpc-cuda
|
||||||
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
|
cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
|
|||||||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||||
|
|
||||||
|
|
||||||
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
|
On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir build-rpc
|
mkdir build-rpc
|
||||||
cd build-rpc
|
cd build-rpc
|
||||||
cmake .. -DLLAMA_RPC=ON
|
cmake .. -DGGML_RPC=ON
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1,7 +1,14 @@
|
|||||||
set(TARGET llama-server)
|
set(TARGET llama-server)
|
||||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||||
|
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||||
|
|
||||||
|
if (MINGW)
|
||||||
|
# fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
|
||||||
|
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
||||||
|
endif()
|
||||||
|
|
||||||
set(TARGET_SRCS
|
set(TARGET_SRCS
|
||||||
server.cpp
|
server.cpp
|
||||||
utils.hpp
|
utils.hpp
|
||||||
@ -24,6 +31,7 @@ set(PUBLIC_ASSETS
|
|||||||
prompt-formats.js
|
prompt-formats.js
|
||||||
json-schema-to-grammar.mjs
|
json-schema-to-grammar.mjs
|
||||||
)
|
)
|
||||||
|
|
||||||
foreach(asset ${PUBLIC_ASSETS})
|
foreach(asset ${PUBLIC_ASSETS})
|
||||||
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
|
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
|
||||||
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
||||||
@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS})
|
|||||||
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
|
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
|
||||||
)
|
)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
add_executable(${TARGET} ${TARGET_SRCS})
|
add_executable(${TARGET} ${TARGET_SRCS})
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
target_compile_definitions(${TARGET} PRIVATE
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
||||||
if (LLAMA_SERVER_SSL)
|
if (LLAMA_SERVER_SSL)
|
||||||
find_package(OpenSSL REQUIRED)
|
find_package(OpenSSL REQUIRED)
|
||||||
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
||||||
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
|
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
@ -8,10 +8,10 @@ cd build
|
|||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
#for FP16
|
#for FP16
|
||||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
|
||||||
|
|
||||||
#for FP32
|
#for FP32
|
||||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
#build example/main
|
#build example/main
|
||||||
#cmake --build . --config Release --target main
|
#cmake --build . --config Release --target main
|
||||||
|
@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
|||||||
|
|
||||||
:: for FP16
|
:: for FP16
|
||||||
:: faster for long-prompt inference
|
:: faster for long-prompt inference
|
||||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "Ninja" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
:: build example/main only
|
||||||
:: make main
|
:: make main
|
||||||
|
238
ggml/CMakeLists.txt
Normal file
238
ggml/CMakeLists.txt
Normal file
@ -0,0 +1,238 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||||
|
project("ggml" C CXX)
|
||||||
|
include(CheckIncludeFileCXX)
|
||||||
|
|
||||||
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
|
||||||
|
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||||
|
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
||||||
|
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||||
|
set(GGML_STANDALONE ON)
|
||||||
|
|
||||||
|
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
|
|
||||||
|
# configure project version
|
||||||
|
# TODO
|
||||||
|
else()
|
||||||
|
set(GGML_STANDALONE OFF)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (EMSCRIPTEN)
|
||||||
|
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||||
|
|
||||||
|
option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
|
||||||
|
else()
|
||||||
|
if (MINGW)
|
||||||
|
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||||
|
else()
|
||||||
|
set(BUILD_SHARED_LIBS_DEFAULT ON)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||||
|
|
||||||
|
#
|
||||||
|
# option list
|
||||||
|
#
|
||||||
|
|
||||||
|
# TODO: mark all options as advanced when not GGML_STANDALONE
|
||||||
|
|
||||||
|
if (APPLE)
|
||||||
|
set(GGML_METAL_DEFAULT ON)
|
||||||
|
set(GGML_BLAS_DEFAULT ON)
|
||||||
|
set(GGML_BLAS_VENDOR_DEFAULT "Apple")
|
||||||
|
else()
|
||||||
|
set(GGML_METAL_DEFAULT OFF)
|
||||||
|
set(GGML_BLAS_DEFAULT OFF)
|
||||||
|
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# general
|
||||||
|
option(GGML_STATIC "ggml: static link libraries" OFF)
|
||||||
|
option(GGML_NATIVE "ggml: enable -march=native flag" ON)
|
||||||
|
option(GGML_LTO "ggml: enable link time optimization" OFF)
|
||||||
|
option(GGML_CCACHE "ggml: use ccache if available" ON)
|
||||||
|
|
||||||
|
# debug
|
||||||
|
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
|
||||||
|
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
|
||||||
|
option(GGML_GPROF "ggml: enable gprof" OFF)
|
||||||
|
|
||||||
|
# build
|
||||||
|
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
|
||||||
|
|
||||||
|
# sanitizers
|
||||||
|
option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
|
||||||
|
option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
|
||||||
|
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
|
||||||
|
|
||||||
|
# instruction set specific
|
||||||
|
if (GGML_NATIVE)
|
||||||
|
set(INS_ENB OFF)
|
||||||
|
else()
|
||||||
|
set(INS_ENB ON)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||||
|
|
||||||
|
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||||
|
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
||||||
|
option(GGML_AVX512 "ggml: enable AVX512" OFF)
|
||||||
|
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
||||||
|
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
|
||||||
|
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
|
||||||
|
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
|
||||||
|
if (NOT MSVC)
|
||||||
|
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
|
||||||
|
endif()
|
||||||
|
option(GGML_LASX "ggml: enable lasx" ON)
|
||||||
|
option(GGML_LSX "ggml: enable lsx" ON)
|
||||||
|
option(GGML_SVE "ggml: enable SVE" OFF)
|
||||||
|
|
||||||
|
if (WIN32)
|
||||||
|
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# ggml core
|
||||||
|
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
|
||||||
|
|
||||||
|
# 3rd party libs / backends
|
||||||
|
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
|
||||||
|
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
|
||||||
|
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
|
||||||
|
"ggml: BLAS library vendor")
|
||||||
|
option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
|
||||||
|
|
||||||
|
option(GGML_CUDA "ggml: use CUDA" OFF)
|
||||||
|
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||||
|
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
||||||
|
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
|
||||||
|
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
|
||||||
|
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
||||||
|
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
|
||||||
|
"ggml: iters./thread per block for Q2_K/Q6_K")
|
||||||
|
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||||
|
"ggml: max. batch size for using peer access")
|
||||||
|
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
||||||
|
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
|
||||||
|
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||||
|
|
||||||
|
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
|
||||||
|
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
|
||||||
|
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
||||||
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||||
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||||
|
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||||
|
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
|
||||||
|
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
||||||
|
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||||
|
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
||||||
|
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
||||||
|
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
||||||
|
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
|
||||||
|
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
|
||||||
|
set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
||||||
|
"ggml: metal minimum macOS version")
|
||||||
|
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
|
||||||
|
option(GGML_OPENMP "ggml: use OpenMP" ON)
|
||||||
|
option(GGML_RPC "ggml: use RPC" OFF)
|
||||||
|
option(GGML_SYCL "ggml: use SYCL" OFF)
|
||||||
|
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||||
|
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||||
|
"ggml: sycl target device")
|
||||||
|
|
||||||
|
# extra artifacts
|
||||||
|
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
|
||||||
|
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
|
||||||
|
|
||||||
|
#
|
||||||
|
# dependencies
|
||||||
|
#
|
||||||
|
|
||||||
|
set(CMAKE_C_STANDARD 11)
|
||||||
|
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||||
|
|
||||||
|
if (GGML_SYCL)
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
else()
|
||||||
|
set(CMAKE_CXX_STANDARD 11)
|
||||||
|
endif()
|
||||||
|
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||||
|
|
||||||
|
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||||
|
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
|
#
|
||||||
|
# build the library
|
||||||
|
#
|
||||||
|
|
||||||
|
add_subdirectory(src)
|
||||||
|
|
||||||
|
#
|
||||||
|
# tests and examples
|
||||||
|
#
|
||||||
|
|
||||||
|
if (GGML_BUILD_TESTS)
|
||||||
|
enable_testing()
|
||||||
|
add_subdirectory(tests)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (GGML_BUILD_EXAMPLES)
|
||||||
|
add_subdirectory(examples)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
#
|
||||||
|
# install
|
||||||
|
#
|
||||||
|
|
||||||
|
include(GNUInstallDirs)
|
||||||
|
include(CMakePackageConfigHelpers)
|
||||||
|
|
||||||
|
set(GGML_PUBLIC_HEADERS
|
||||||
|
include/ggml.h
|
||||||
|
include/ggml-alloc.h
|
||||||
|
include/ggml-backend.h
|
||||||
|
"${GGML_HEADERS_CUDA}"
|
||||||
|
"${GGML_HEADERS_METAL}"
|
||||||
|
"${GGML_HEADERS_EXTRA}")
|
||||||
|
|
||||||
|
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||||
|
#if (GGML_METAL)
|
||||||
|
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
|
||||||
|
#endif()
|
||||||
|
install(TARGETS ggml PUBLIC_HEADER)
|
||||||
|
|
||||||
|
if (BUILD_SHARED_LIBS)
|
||||||
|
install(TARGETS ggml LIBRARY)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_METAL)
|
||||||
|
install(
|
||||||
|
FILES src/ggml-metal.metal
|
||||||
|
PERMISSIONS
|
||||||
|
OWNER_READ
|
||||||
|
OWNER_WRITE
|
||||||
|
GROUP_READ
|
||||||
|
WORLD_READ
|
||||||
|
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||||
|
|
||||||
|
if (NOT GGML_METAL_EMBED_LIBRARY)
|
||||||
|
install(
|
||||||
|
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||||
|
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_STANDALONE)
|
||||||
|
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
|
||||||
|
${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
|
||||||
|
@ONLY)
|
||||||
|
|
||||||
|
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
|
||||||
|
DESTINATION share/pkgconfig)
|
||||||
|
endif()
|
@ -79,22 +79,22 @@ endmacro()
|
|||||||
# flags are for MSVC only!
|
# flags are for MSVC only!
|
||||||
check_sse("AVX" " ;/arch:AVX")
|
check_sse("AVX" " ;/arch:AVX")
|
||||||
if (NOT ${AVX_FOUND})
|
if (NOT ${AVX_FOUND})
|
||||||
set(LLAMA_AVX OFF)
|
set(GGML_AVX OFF)
|
||||||
else()
|
else()
|
||||||
set(LLAMA_AVX ON)
|
set(GGML_AVX ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
check_sse("AVX2" " ;/arch:AVX2")
|
check_sse("AVX2" " ;/arch:AVX2")
|
||||||
check_sse("FMA" " ;/arch:AVX2")
|
check_sse("FMA" " ;/arch:AVX2")
|
||||||
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
|
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
|
||||||
set(LLAMA_AVX2 OFF)
|
set(GGML_AVX2 OFF)
|
||||||
else()
|
else()
|
||||||
set(LLAMA_AVX2 ON)
|
set(GGML_AVX2 ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
check_sse("AVX512" " ;/arch:AVX512")
|
check_sse("AVX512" " ;/arch:AVX512")
|
||||||
if (NOT ${AVX512_FOUND})
|
if (NOT ${AVX512_FOUND})
|
||||||
set(LLAMA_AVX512 OFF)
|
set(GGML_AVX512 OFF)
|
||||||
else()
|
else()
|
||||||
set(LLAMA_AVX512 ON)
|
set(GGML_AVX512 ON)
|
||||||
endif()
|
endif()
|
@ -8,7 +8,9 @@
|
|||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
#include "ggml-sycl/presets.hpp"
|
|
||||||
|
#define GGML_SYCL_NAME "SYCL"
|
||||||
|
#define GGML_SYCL_MAX_DEVICES 48
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
1171
ggml/src/CMakeLists.txt
Normal file
1171
ggml/src/CMakeLists.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -603,7 +603,7 @@ static void on_no_fattn_vec_case(const int D) {
|
|||||||
if (D == 64) {
|
if (D == 64) {
|
||||||
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
|
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
|
||||||
fprintf(stderr, "By default only f16 KV cache is supported.\n");
|
fprintf(stderr, "By default only f16 KV cache is supported.\n");
|
||||||
fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
|
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
} else if (D == 128) {
|
} else if (D == 128) {
|
||||||
fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
|
fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
|
||||||
@ -611,7 +611,7 @@ static void on_no_fattn_vec_case(const int D) {
|
|||||||
fprintf(stderr, " - K == q4_0, V == q4_0, 4.50 BPV\n");
|
fprintf(stderr, " - K == q4_0, V == q4_0, 4.50 BPV\n");
|
||||||
fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n");
|
fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n");
|
||||||
fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n");
|
fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n");
|
||||||
fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
|
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
|
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user