ggml : add unified SYCL backend for Intel GPUs (#2690)

* first update for migration

* update init_cublas

* add debug function, commit all helper code

* step 1

* step 2

* step3 add fp16, slower 31->28

* add GGML_LIST_DEVICE function

* step 5 format device and print

* step6, enhance error check, remove CUDA macro, enhance device id to fix non-zero id issue

* support main device is non-zero

* step7 add debug for code path, rm log

* step 8, rename all macro & func from cuda by sycl

* fix error of select non-zero device, format device list

* ren ggml-sycl.hpp -> ggml-sycl.h

* clear CMAKE to rm unused lib and options

* correct queue: rm dpct:get_queue

* add print tensor function to debug

* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481

* summary dpct definition in one header file to replace folder:dpct

* refactor device log

* mv dpct definition from folder dpct to ggml-sycl.h

* update readme, refactor build script

* fix build with sycl

* set nthread=1 when sycl, increase performance

* add run script, comment debug code

* add ls-sycl-device tool

* add ls-sycl-device, rm unused files

* rm rear space

* dos2unix

* Update README_sycl.md

* fix return type

* remove sycl version from include path

* restore rm code to fix hang issue

* add syc and link for sycl readme

* rm original sycl code before refactor

* fix code err

* add known issue for PVC hang issue

* enable SYCL_F16 support

* align pr4766

* check for sycl blas, better performance

* cleanup 1

* remove extra endif

* add build&run script, clean CMakefile, update guide by review comments

* rename macro to intel hardware

* editor config format

* format fixes

* format fixes

* editor format fix

* Remove unused headers

* skip build sycl tool for other code path

* replace tab by space

* fix blas matmul function

* fix mac build

* restore hip dependency

* fix conflict

* ren as review comments

* mv internal function to .cpp file

* export function print_sycl_devices(), mv class dpct definition to source file

* update CI/action for sycl code, fix CI error of repeat/dup

* fix action ID format issue

* rm unused strategy

* enable llama_f16 in ci

* fix conflict

* fix build break on macOS; the macOS CI depends on external ggml instead of internal ggml

* fix ci cases for unsupported data type

* revert unrelated changes in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL

* revert hip cmake changes

* fix indent

* add prefix in func name

* revert no mmq

* rm cpu blas duplicate

* fix no_new_line

* fix src1->type==F16 bug.

* pass batch offset for F16 src1

* fix batch error

* fix wrong code

* revert sycl checking in test-sampling

* pass void as arguments of ggml_backend_sycl_print_sycl_devices

* remove extra blank line in test-sampling

* revert setting n_threads in sycl

* implement std::isinf for icpx with fast math.

* Update ci/run.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update examples/sycl/run-llama2.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update examples/sycl/run-llama2.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* add copyright and MIT license declare

* update the cmd example

---------

Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Abhilash Majumder 2024-01-28 21:26:23 +05:30 committed by GitHub
parent b764b8f1d0
commit 0f648573dd
22 changed files with 15764 additions and 24 deletions


@ -143,6 +143,47 @@ jobs:
cd build
ctest -L main --verbose
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v2
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124


@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -103,6 +104,8 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@ -121,8 +124,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
#
# Compile flags
#
if (LLAMA_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
@ -454,6 +461,32 @@ if (LLAMA_HIPBLAS)
endif()
endif()
if (LLAMA_SYCL)
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "ONEAPI_ROOT environment variable not detected, please install oneAPI and source it, e.g.: source /opt/intel/oneapi/setvars.sh")
endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
if (LLAMA_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
endif()
add_compile_definitions(GGML_USE_SYCL)
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
endif()
function(get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
@ -479,10 +512,12 @@ function(get_flags CCID CCVER)
list(APPEND CXX_FLAGS -Wextra-semi)
endif()
elseif (CCID MATCHES "Intel")
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
if (NOT LLAMA_SYCL)
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
endif()
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
@ -799,6 +834,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})


@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
- CUDA, Metal and OpenCL GPU backend support
- CUDA, Metal, OpenCL, SYCL GPU backend support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@ -599,6 +599,15 @@ Building the program with BLAS support may lead to some performance improvements
You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
The SYCL build of llama.cpp supports Intel GPUs (Data Center Max series, Flex series, Arc series, built-in GPUs and iGPUs).
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
### Prepare Data & Run
```bash

README_sycl.md Normal file

@ -0,0 +1,252 @@
# llama.cpp for SYCL
[Background](#background)
[OS](#os)
[Intel GPU](#intel-gpu)
[Linux](#linux)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Todo](#todo)
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators, such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
oneAPI is an open, standards-based specification that supports multiple architecture types, including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
Intel uses SYCL as the direct programming language to support its CPUs, GPUs and FPGAs.
To avoid reinventing the wheel, this code follows the other accelerated code paths in llama.cpp (OpenBLAS, cuBLAS, CLBlast). The open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used to migrate the CUDA code to SYCL; a simplified illustration of this kind of migration is shown at the end of this section.
llama.cpp for SYCL supports Intel GPUs.
For Intel CPUs, we recommend the x86 build of llama.cpp (with Intel MKL).
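The sketch below illustrates the kind of CUDA-to-SYCL translation that SYCLomatic automates: device memory allocation, a host-to-device copy, and a kernel launch. It is an illustrative example only, not code taken from ggml-sycl.cpp.
```
// Illustrative sketch: CUDA-style allocate/copy/launch and its SYCL counterpart.
// Not code from ggml-sycl.cpp.
#include <sycl/sycl.hpp>
#include <vector>

int main() {
    sycl::queue q;                          // default device (a GPU if one is available)
    std::vector<float> host(1024, 1.0f);

    // CUDA: cudaMalloc + cudaMemcpy  ->  SYCL: USM device allocation + queue::memcpy
    float * dev = sycl::malloc_device<float>(host.size(), q);
    q.memcpy(dev, host.data(), host.size() * sizeof(float)).wait();

    // CUDA: kernel<<<grid, block>>>(dev)  ->  SYCL: queue::parallel_for
    q.parallel_for(sycl::range<1>(host.size()), [=](sycl::id<1> i) {
        dev[i] *= 2.0f;
    }).wait();

    q.memcpy(host.data(), dev, host.size() * sizeof(float)).wait();
    sycl::free(dev, q);
    return 0;
}
```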
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |
## Intel GPU
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
## Linux
### Setup Environment
1. Install the Intel GPU driver.
a. Install the Intel GPU driver by following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
Note: for iGPUs, install the client GPU driver.
b. Add your user to the video and render groups:
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Note: log out and log back in for the group changes to take effect.
c. Check:
```
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install the Intel® oneAPI Base Toolkit.
a. Follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
We recommend installing to the default folder: **/opt/intel/oneapi**.
The following guide uses the default folder as an example; if you installed to a different folder, adjust the paths accordingly.
b. Check:
```
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more Level Zero devices listed, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
3. Build locally:
```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
```
or
```
./examples/sycl/build.sh
```
Note:
- By default, all binaries are built, which takes longer. To save time, we recommend building only **example/main**.
### Run
1. Put the model file in the **models** folder.
2. Enable the oneAPI runtime environment:
```
source /opt/intel/oneapi/setvars.sh
```
3. List the device IDs.
Run either tool without parameters:
```
./build/bin/ls-sycl-device
or
./build/bin/main
```
Check the device ID in the startup log, for example:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level Zero in most cases|
4. Set the device ID and run llama.cpp.
Set device ID 0 with **GGML_SYCL_DEVICE=0**:
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run the script:
```
./examples/sycl/run-llama2.sh
```
Note:
- By default, mmap is used to read the model file. On some systems this leads to a hang; use the **--no-mmap** parameter to disable mmap() and work around it.
5. Check the device ID in the output.
For example:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
#### Build
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable the SYCL code path. <br>LLAMA_SYCL=ON is mandatory for both FP32 and FP16.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 in the SYCL code path; faster for long-prompt inference. <br>Leave it unset for FP32.|
|CMAKE_C_COMPILER|icx|Use the icx compiler for the SYCL code path|
|CMAKE_CXX_COMPILER|icpx|Use the icpx compiler for the SYCL code path|
#### Running
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the available IDs in the default startup output.|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable debug logging guarded by the GGML_SYCL_DEBUG macro.|
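For reference, the sketch below shows how a variable like GGML_SYCL_DEVICE is typically read at run time. It is illustrative only; the actual parsing is done inside ggml-sycl.cpp and may differ.
```
// Illustrative only: read GGML_SYCL_DEVICE from the environment, defaulting to device 0.
#include <cstdlib>

static int get_sycl_device_from_env() {
    const char * val = std::getenv("GGML_SYCL_DEVICE");
    return val != nullptr ? std::atoi(val) : 0;
}
```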
## Known Issue
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
The oneAPI runtime environment is not enabled.
Install the oneAPI base toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.
- Hang during startup
llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems the memcpy can block abnormally.
Solution: add **--no-mmap**.
## Todo
- Support building on Windows.
- Support multiple cards.


@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```


@ -10,6 +10,9 @@
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
fi
## helpers
# download a file if it does not exist or if it is outdated


@ -42,6 +42,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
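// helper define: set when either the cuBLAS or the SYCL backend is enabled; used below to gate warnings about GPU-only options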
#define GGML_USE_CUBLAS_SYCL
#endif
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@ -599,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS
#ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
@ -618,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
#ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
@ -643,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f;
}
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS
#ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--numa") {
@ -1007,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
#endif
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
@ -1514,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");


@ -23,6 +23,9 @@ else()
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)


@ -2099,7 +2099,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@ -2125,7 +2125,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{
#ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.mul_mat_q = false;
#else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
@ -2138,7 +2138,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});


@ -0,0 +1,9 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

examples/sycl/README.md Normal file

@ -0,0 +1,47 @@
# llama.cpp/example/sycl
This folder provides the tools for running llama.cpp for SYCL on Intel GPUs.
## Tool
|Tool Name| Function|Status|
|-|-|-|
|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|
### ls-sycl-device
List all SYCL devices with ID, compute capability, max work group size, etc.
1. Build llama.cpp for SYCL with all targets.
2. Enable the oneAPI runtime environment:
```
source /opt/intel/oneapi/setvars.sh
```
3. Execute
```
./build/bin/ls-sycl-device
```
Check the device IDs in the startup log, for example:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level Zero in most cases|

examples/sycl/build.sh Executable file

@ -0,0 +1,20 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v


@ -0,0 +1,11 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#include "ggml-sycl.h"
int main(int argc, char ** argv) {
ggml_backend_sycl_print_sycl_devices();
return 0;
}

examples/sycl/run-llama2.sh Executable file

@ -0,0 +1,19 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then
export GGML_SYCL_DEVICE=$1
else
export GGML_SYCL_DEVICE=0
fi
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
#export GGML_SYCL_DEBUG=1
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0


@ -339,6 +339,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_cuda_reg_devices();
#endif
#ifdef GGML_USE_SYCL
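// declared extern here, mirroring the CUDA path above; registers the available SYCL devices with the backend registry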
extern void ggml_backend_sycl_reg_devices(void);
ggml_backend_sycl_reg_devices();
#endif
#ifdef GGML_USE_METAL
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

ggml-sycl.cpp Normal file

File diff suppressed because it is too large

ggml-sycl.h Normal file

@ -0,0 +1,27 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 16
#define GGML_SYCL_NAME "SYCL"
GGML_API void ggml_init_sycl(void);
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
#ifdef __cplusplus
}
#endif
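The sketch below shows how this API could be used from an application: list the devices, initialize the backend for one of them, and release it. It is illustrative only (it assumes a SYCL build and that device 0 exists); `ggml_backend_name` and `ggml_backend_free` come from ggml-backend.h.
```
// Illustrative usage of the ggml-sycl.h API declared above; not part of this commit.
#include "ggml-sycl.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // list the available SYCL devices and their IDs
    ggml_backend_sycl_print_sycl_devices();

    // initialize the backend for device 0 (see GGML_SYCL_DEVICE in README_sycl.md)
    ggml_backend_t backend = ggml_backend_sycl_init(0);
    if (backend == nullptr) {
        std::fprintf(stderr, "failed to initialize the SYCL backend\n");
        return 1;
    }
    std::printf("initialized backend: %s\n", ggml_backend_name(backend));

    ggml_backend_free(backend);
    return 0;
}
```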

ggml.c

@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#endif
// floating point type used to accumulate sums
@ -2293,6 +2295,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST)
ggml_cl_init();
#elif defined(GGML_USE_SYCL)
ggml_init_sycl();
#endif
ggml_setup_op_has_task_pass();
@ -14701,6 +14705,12 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
#ifdef GGML_USE_SYCL
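// let the SYCL backend handle the op if it can; when it does, skip the CPU implementation below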
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
if (skip_cpu) {
return;
}
#endif // GGML_USE_SYCL
switch (tensor->op) {
case GGML_OP_DUP:
{
@ -20280,7 +20290,7 @@ int ggml_cpu_has_wasm_simd(void) {
}
int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
@ -20303,8 +20313,16 @@ int ggml_cpu_has_clblast(void) {
#endif
}
int ggml_cpu_has_sycl(void) {
#if defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_sycl();
}
int ggml_cpu_has_sse3(void) {

ggml.h

@ -2266,6 +2266,7 @@ extern "C" {
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_vsx (void);
//


@ -11,6 +11,8 @@
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h"
#elif defined(GGML_USE_SYCL)
# include "ggml-sycl.h"
#endif
#ifdef GGML_USE_METAL
@ -1278,6 +1280,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type();
}
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_host_buffer_type();
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#endif
@ -1297,6 +1301,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUBLAS)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#endif
@ -10225,6 +10231,16 @@ struct llama_context * llama_new_context_with_model(
}
}
}
#elif defined(GGML_USE_SYCL)
if (model->n_gpu_layers > 0) {
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {


@ -6,6 +6,9 @@
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
@ -46,7 +49,7 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 4
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_SYCL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif


@ -239,10 +239,17 @@ static std::string var_to_str(ggml_type type) {
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
#ifdef GGML_USE_SYCL
static bool inline _isinf(float f) {
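// with icpx and fast math, std::isinf() may be optimized away (fast math assumes no inf/NaN),
// so compare against the IEEE-754 bit pattern of infinity (sign bit masked off) instead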
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
}
#else
static bool inline _isinf(float f) { return std::isinf(f); }
#endif
// accept FLT_MAX as infinity
static bool isinf_or_max(float f) {
return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX;
return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
}
static bool ggml_is_view_op(enum ggml_op op) {