ggml : add unified SYCL backend for Intel GPUs (#2690)

* first update for migration

* update init_cublas

* add debug functio, commit all help code

* step 1

* step 2

* step3 add fp16, slower 31->28

* add GGML_LIST_DEVICE function

* step 5 format device and print

* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue

* support main device is non-zero

* step7 add debug for code path, rm log

* step 8, rename all macro & func from cuda by sycl

* fix error of select non-zero device, format device list

* ren ggml-sycl.hpp -> ggml-sycl.h

* clear CMAKE to rm unused lib and options

* correct queue: rm dtct:get_queue

* add print tensor function to debug

* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481

* summary dpct definition in one header file to replace folder:dpct

* refactor device log

* mv dpct definition from folder dpct to ggml-sycl.h

* update readme, refactor build script

* fix build with sycl

* set nthread=1 when sycl, increase performance

* add run script, comment debug code

* add ls-sycl-device tool

* add ls-sycl-device, rm unused files

* rm rear space

* dos2unix

* Update README_sycl.md

* fix return type

* remove sycl version from include path

* restore rm code to fix hang issue

* add syc and link for sycl readme

* rm original sycl code before refactor

* fix code err

* add know issue for pvc hang issue

* enable SYCL_F16 support

* align pr4766

* check for sycl blas, better performance

* cleanup 1

* remove extra endif

* add build&run script, clean CMakefile, update guide by review comments

* rename macro to intel hardware

* editor config format

* format fixes

* format fixes

* editor format fix

* Remove unused headers

* skip build sycl tool for other code path

* replace tab by space

* fix blas matmul function

* fix mac build

* restore hip dependency

* fix conflict

* ren as review comments

* mv internal function to .cpp file

* export funciton print_sycl_devices(), mv class dpct definition to source file

* update CI/action for sycl code, fix CI error of repeat/dup

* fix action ID format issue

* rm unused strategy

* enable llama_f16 in ci

* fix conflict

* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml

* fix ci cases for unsupported data type

* revert unrelated changed in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL

* revert hip cmake changes

* fix indent

* add prefix in func name

* revert no mmq

* rm cpu blas duplicate

* fix no_new_line

* fix src1->type==F16 bug.

* pass batch offset for F16 src1

* fix batch error

* fix wrong code

* revert sycl checking in test-sampling

* pass void as arguments of ggml_backend_sycl_print_sycl_devices

* remove extra blank line in test-sampling

* revert setting n_threads in sycl

* implement std::isinf for icpx with fast math.

* Update ci/run.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update examples/sycl/run-llama2.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update examples/sycl/run-llama2.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* add copyright and MIT license declare

* update the cmd example

---------

Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Abhilash Majumder 2024-01-28 21:26:23 +05:30 committed by GitHub
parent b764b8f1d0
commit 0f648573dd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 15764 additions and 24 deletions

View File

@ -143,6 +143,47 @@ jobs:
cd build cd build
ctest -L main --verbose ctest -L main --verbose
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v2
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it. # how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124

View File

@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX) project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -103,6 +104,8 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@ -121,8 +124,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
# #
# Compile flags # Compile flags
# #
if (LLAMA_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD_REQUIRED true)
@ -454,6 +461,32 @@ if (LLAMA_HIPBLAS)
endif() endif()
endif() endif()
if (LLAMA_SYCL)
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
if (LLAMA_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
endif()
add_compile_definitions(GGML_USE_SYCL)
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
endif()
function(get_flags CCID CCVER) function(get_flags CCID CCVER)
set(C_FLAGS "") set(C_FLAGS "")
set(CXX_FLAGS "") set(CXX_FLAGS "")
@ -479,11 +512,13 @@ function(get_flags CCID CCVER)
list(APPEND CXX_FLAGS -Wextra-semi) list(APPEND CXX_FLAGS -Wextra-semi)
endif() endif()
elseif (CCID MATCHES "Intel") elseif (CCID MATCHES "Intel")
if (NOT LLAMA_SYCL)
# enable max optimization level when using Intel compiler # enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector) set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector) set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel) add_link_options(-fuse-ld=lld -static-intel)
endif() endif()
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
@ -799,6 +834,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
) )
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

View File

@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- AVX, AVX2 and AVX512 support for x86 architectures - AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision - Mixed F16 / F32 precision
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support - 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
- CUDA, Metal and OpenCL GPU backend support - CUDA, Metal, OpenCL, SYCL GPU backend support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022). The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@ -599,6 +599,15 @@ Building the program with BLAS support may lead to some performance improvements
You can get a list of platforms and devices from the `clinfo -l` command, etc. You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
### Prepare Data & Run ### Prepare Data & Run
```bash ```bash

252
README_sycl.md Normal file
View File

@ -0,0 +1,252 @@
# llama.cpp for SYCL
[Background](#background)
[OS](#os)
[Intel GPU](#intel-gpu)
[Linux](#linux)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Todo](#todo)
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
The llama.cpp for SYCL is used to support Intel GPUs.
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |
## Intel GPU
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
## Linux
### Setup Environment
1. Install Intel GPU driver.
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
Note: for iGPU, please install the client GPU driver.
b. Add user to group: video, render.
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Note: re-login to enable it.
c. Check
```
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **/opt/intel/oneapi**.
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
b. Check
```
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
2. Build locally:
```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
```
or
```
./examples/sycl/build.sh
```
Note:
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
### Run
1. Put model file to folder **models**
2. Enable oneAPI running environment
```
source /opt/intel/oneapi/setvars.sh
```
3. List device ID
Run without parameter:
```
./build/bin/ls-sycl-device
or
./build/bin/main
```
Check the ID in startup log, like:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Set device ID and execute llama.cpp
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:
```
./examples/sycl/run_llama2.sh
```
Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
5. Check the device ID in output
Like
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
#### Build
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
#### Running
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
## Known Issue
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
Miss to enable oneAPI running environment.
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
- Hang during startup
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
Solution: add **--no-mmap**.
## Todo
- Support to build in Windows.
- Support multiple cards.

View File

@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support # with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
``` ```

View File

@ -10,6 +10,9 @@
# # with CUDA support # # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# #
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>" echo "usage: $0 <output-dir> <mnt-dir>"
@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1" CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
fi
## helpers ## helpers
# download a file if it does not exist or if it is outdated # download a file if it does not exist or if it is outdated

View File

@ -42,6 +42,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
#define GGML_USE_CUBLAS_SYCL
#endif
int32_t get_num_physical_cores() { int32_t get_num_physical_cores() {
#ifdef __linux__ #ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores // enumerate the set of thread siblings, num entries is num cores
@ -599,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.main_gpu = std::stoi(argv[i]); params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--split-mode" || arg == "-sm") { } else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -618,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true; invalid_param = true;
break; break;
} }
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--tensor-split" || arg == "-ts") { } else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -643,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f; params.tensor_split[i] = 0.0f;
} }
} }
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--no-mmap") { } else if (arg == "--no-mmap") {
params.use_mmap = false; params.use_mmap = false;
} else if (arg == "--numa") { } else if (arg == "--numa") {
@ -1007,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
#endif #endif // LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n"); printf(" -gan N, --grp-attn-n N\n");
@ -1514,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");

View File

@ -23,6 +23,9 @@ else()
add_subdirectory(infill) add_subdirectory(infill)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(llava) add_subdirectory(llava)
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(main) add_subdirectory(main)
add_subdirectory(tokenize) add_subdirectory(tokenize)
add_subdirectory(parallel) add_subdirectory(parallel)

View File

@ -2099,7 +2099,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
#ifdef GGML_USE_CUBLAS #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i]; std::string arg_next = argv[i];
// split string by , and / // split string by , and /
@ -2125,7 +2125,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
else if (arg == "--no-mul-mat-q" || arg == "-nommq") else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{ {
#ifdef GGML_USE_CUBLAS #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.mul_mat_q = false; params.mul_mat_q = false;
#else #else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {}); LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
@ -2138,7 +2138,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
#ifdef GGML_USE_CUBLAS #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]); params.main_gpu = std::stoi(argv[i]);
#else #else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {}); LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});

View File

@ -0,0 +1,9 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

47
examples/sycl/README.md Normal file
View File

@ -0,0 +1,47 @@
# llama.cpp/example/sycl
This example program provide the tools for llama.cpp for SYCL on Intel GPU.
## Tool
|Tool Name| Function|Status|
|-|-|-|
|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
### ls-sycl-device
List all SYCL devices with ID, compute capability, max work group size, ect.
1. Build the llama.cpp for SYCL for all targets.
2. Enable oneAPI running environment
```
source /opt/intel/oneapi/setvars.sh
```
3. Execute
```
./build/bin/ls-sycl-device
```
Check the ID in startup log, like:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

20
examples/sycl/build.sh Executable file
View File

@ -0,0 +1,20 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v

View File

@ -0,0 +1,11 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#include "ggml-sycl.h"
int main(int argc, char ** argv) {
ggml_backend_sycl_print_sycl_devices();
return 0;
}

19
examples/sycl/run-llama2.sh Executable file
View File

@ -0,0 +1,19 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then
export GGML_SYCL_DEVICE=$1
else
export GGML_SYCL_DEVICE=0
fi
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
#export GGML_SYCL_DEBUG=1
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0

View File

@ -339,6 +339,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_cuda_reg_devices(); ggml_backend_cuda_reg_devices();
#endif #endif
#ifdef GGML_USE_SYCL
extern void ggml_backend_sycl_reg_devices(void);
ggml_backend_sycl_reg_devices();
#endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

15197
ggml-sycl.cpp Normal file

File diff suppressed because it is too large Load Diff

27
ggml-sycl.h Normal file
View File

@ -0,0 +1,27 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 16
#define GGML_SYCL_NAME "SYCL"
GGML_API void ggml_init_sycl(void);
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
#ifdef __cplusplus
}
#endif

22
ggml.c
View File

@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
#include "ggml-cuda.h" #include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h" #include "ggml-opencl.h"
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#endif #endif
// floating point type used to accumulate sums // floating point type used to accumulate sums
@ -2293,6 +2295,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_init_cublas(); ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
ggml_cl_init(); ggml_cl_init();
#elif defined(GGML_USE_SYCL)
ggml_init_sycl();
#endif #endif
ggml_setup_op_has_task_pass(); ggml_setup_op_has_task_pass();
@ -14701,6 +14705,12 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
#ifdef GGML_USE_SYCL
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
if (skip_cpu) {
return;
}
#endif // GGML_USE_SYCL
switch (tensor->op) { switch (tensor->op) {
case GGML_OP_DUP: case GGML_OP_DUP:
{ {
@ -20280,7 +20290,7 @@ int ggml_cpu_has_wasm_simd(void) {
} }
int ggml_cpu_has_blas(void) { int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1; return 1;
#else #else
return 0; return 0;
@ -20303,8 +20313,16 @@ int ggml_cpu_has_clblast(void) {
#endif #endif
} }
int ggml_cpu_has_sycl(void) {
#if defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_gpublas(void) { int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_sycl();
} }
int ggml_cpu_has_sse3(void) { int ggml_cpu_has_sse3(void) {

1
ggml.h
View File

@ -2266,6 +2266,7 @@ extern "C" {
GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
// //

View File

@ -11,6 +11,8 @@
# include "ggml-cuda.h" # include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h" # include "ggml-opencl.h"
#elif defined(GGML_USE_SYCL)
# include "ggml-sycl.h"
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
@ -1278,6 +1280,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
if (host_buffer) { if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type(); buft = ggml_backend_cuda_host_buffer_type();
} }
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_host_buffer_type();
#elif defined(GGML_USE_CPU_HBM) #elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type(); buft = ggml_backend_cpu_hbm_buffer_type();
#endif #endif
@ -1297,6 +1301,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
buft = ggml_backend_metal_buffer_type(); buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUBLAS) #elif defined(GGML_USE_CUBLAS)
buft = ggml_backend_cuda_buffer_type(gpu); buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type(); buft = ggml_backend_opencl_buffer_type();
#endif #endif
@ -10225,6 +10231,16 @@ struct llama_context * llama_new_context_with_model(
} }
} }
} }
#elif defined(GGML_USE_SYCL)
if (model->n_gpu_layers > 0) {
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#endif #endif
ctx->backend_cpu = ggml_backend_cpu_init(); ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) { if (ctx->backend_cpu == nullptr) {

View File

@ -6,6 +6,9 @@
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h" #include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
#else #else
#define LLAMA_MAX_DEVICES 1 #define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
@ -46,7 +49,7 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 4 #define LLAMA_SESSION_VERSION 4
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_SYCL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU. // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD #define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif #endif

View File

@ -239,10 +239,17 @@ static std::string var_to_str(ggml_type type) {
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j) #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k) #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
#ifdef GGML_USE_SYCL
static bool inline _isinf(float f) {
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
}
#else
static bool inline _isinf(float f) { return std::isinf(f); }
#endif
// accept FLT_MAX as infinity // accept FLT_MAX as infinity
static bool isinf_or_max(float f) { static bool isinf_or_max(float f) {
return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX; return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
} }
static bool ggml_is_view_op(enum ggml_op op) { static bool ggml_is_view_op(enum ggml_op op) {