mirror of https://github.com/ggerganov/llama.cpp.git
synced 2024-12-28 12:24:35 +00:00

Merge branch 'master' into gg/flash-attn

This commit is contained in:
commit 0ad44baf33

41 .github/workflows/build.yml (vendored)
@@ -143,6 +143,47 @@ jobs:
           cd build
           ctest -L main --verbose

+  ubuntu-22-cmake-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mkdir build
+          cd build
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake --build . --config Release -j $(nproc)
+
   # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   # how to debug it.
   # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
+include(CheckIncludeFileCXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -98,11 +99,14 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_VULKAN "llama: use Vulkan" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
+option(LLAMA_SYCL "llama: use SYCL" OFF)
+option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -121,8 +125,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 #
 # Compile flags
 #
+if (LLAMA_SYCL)
+    set(CMAKE_CXX_STANDARD 17)
+else()
+    set(CMAKE_CXX_STANDARD 11)
+endif()

-set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
@@ -409,6 +417,22 @@ if (LLAMA_CLBLAST)
     endif()
 endif()

+if (LLAMA_VULKAN)
+    find_package(Vulkan)
+    if (Vulkan_FOUND)
+        message(STATUS "Vulkan found")
+
+        add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
+        target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+
+        add_compile_definitions(GGML_USE_VULKAN)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
+    else()
+        message(WARNING "Vulkan not found")
+    endif()
+endif()
+
 if (LLAMA_HIPBLAS)
     list(APPEND CMAKE_PREFIX_PATH /opt/rocm)

@@ -454,6 +478,32 @@ if (LLAMA_HIPBLAS)
     endif()
 endif()

+if (LLAMA_SYCL)
+    if ( NOT DEFINED ENV{ONEAPI_ROOT})
+        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
+    endif()
+    #todo: AOT
+
+    find_package(IntelSYCL REQUIRED)
+    if (LLAMA_SYCL_F16)
+        add_compile_definitions(GGML_SYCL_F16)
+    endif()
+    add_compile_definitions(GGML_USE_SYCL)
+
+    add_compile_options(-I./) #include DPCT
+    add_compile_options(-I/${SYCL_INCLUDE_DIR})
+
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+
+    set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
+    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+endif()
+
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
@@ -479,10 +529,12 @@ function(get_flags CCID CCVER)
             list(APPEND CXX_FLAGS -Wextra-semi)
         endif()
     elseif (CCID MATCHES "Intel")
-        # enable max optimization level when using Intel compiler
-        set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-        set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-        add_link_options(-fuse-ld=lld -static-intel)
+        if (NOT LLAMA_SYCL)
+            # enable max optimization level when using Intel compiler
+            set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+            set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+            add_link_options(-fuse-ld=lld -static-intel)
+        endif()
     endif()

     set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
@@ -799,6 +851,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
             )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
13 Makefile
@@ -448,6 +448,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST

+ifdef LLAMA_VULKAN
+MK_CPPFLAGS += -DGGML_USE_VULKAN
+MK_LDFLAGS += -lvulkan
+OBJS += ggml-vulkan.o
+
+ifdef LLAMA_VULKAN_CHECK_RESULTS
+MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
+ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_VULKAN
+
 ifdef LLAMA_HIPBLAS

 ifeq ($(wildcard /opt/rocm),)
11 README.md
@@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
 - 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
-- CUDA, Metal and OpenCL GPU backend support
+- CUDA, Metal, OpenCL, SYCL GPU backend support

 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
 Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@@ -599,6 +599,15 @@ Building the program with BLAS support may lead to some performance improvements

   You can get a list of platforms and devices from the `clinfo -l` command, etc.

+- #### SYCL
+
+  SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+  llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+  For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
+
+
 ### Prepare Data & Run

 ```bash
252 README_sycl.md (new file)

# llama.cpp for SYCL

[Background](#background)
[OS](#os)
[Intel GPU](#intel-gpu)
[Linux](#linux)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Todo](#todo)

## Background

SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators — such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.

oneAPI is an open, standards-based specification that supports multiple architecture types, including but not limited to GPU, CPU and FPGA. The spec has both direct programming and API-based programming paradigms.

Intel uses SYCL as the direct programming language to support its CPUs, GPUs and FPGAs.

To avoid re-inventing the wheel, this code follows other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We used the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate to SYCL.

llama.cpp for SYCL is used to support Intel GPUs.

For Intel CPUs, we recommend llama.cpp for x86 (Intel MKL build).

## OS

|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |

## Intel GPU

|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|

## Linux

### Setup Environment

1. Install the Intel GPU driver.

   a. Install the Intel GPU driver by following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).

   Note: for an iGPU, install the client GPU driver.

   b. Add the user to the video and render groups.

   ```
   sudo usermod -aG render username
   sudo usermod -aG video username
   ```

   Note: re-login to enable it.

   c. Check

   ```
   sudo apt install clinfo
   sudo clinfo -l
   ```

   Output (example):

   ```
   Platform #0: Intel(R) OpenCL Graphics
    `-- Device #0: Intel(R) Arc(TM) A770 Graphics

   Platform #0: Intel(R) OpenCL HD Graphics
    `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
   ```

2. Install the Intel® oneAPI Base Toolkit.

   a. Follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

   We recommend installing to the default folder: **/opt/intel/oneapi**. The following guide uses the default folder as an example; if you use another folder, adjust the paths accordingly.

   b. Check

   ```
   source /opt/intel/oneapi/setvars.sh

   sycl-ls
   ```

   There should be one or more level-zero devices, like **[ext_oneapi_level_zero:gpu:0]**.

   Output (example):
   ```
   [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
   [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
   [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
   [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
   ```

3. Build locally:

   ```
   mkdir -p build
   cd build
   source /opt/intel/oneapi/setvars.sh

   #for FP16
   #cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference

   #for FP32
   cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

   #build example/main only
   #cmake --build . --config Release --target main

   #build all binary
   cmake --build . --config Release -v
   ```

   or

   ```
   ./examples/sycl/build.sh
   ```

   Note:

   - By default, all binaries are built, which takes more time. To reduce the build time, we recommend building **example/main** only.

### Run

1. Put the model file in the **models** folder.

2. Enable the oneAPI running environment

   ```
   source /opt/intel/oneapi/setvars.sh
   ```

3. List the device IDs

   Run without parameters:

   ```
   ./build/bin/ls-sycl-device

   or

   ./build/bin/main
   ```

   Check the IDs in the startup log, like:

   ```
   found 4 SYCL devices:
     Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
       max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
     Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
       max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
     Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
       max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
     Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
       max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
   ```

   |Attribute|Note|
   |-|-|
   |compute capability 1.3|Level-zero running time, recommended|
   |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

4. Set the device ID and execute llama.cpp

   Set device ID = 0 by **GGML_SYCL_DEVICE=0**

   ```
   GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
   ```
   or run by script:

   ```
   ./examples/sycl/run_llama2.sh
   ```

   Note:

   - By default, mmap is used to read the model file. In some cases it leads to a hang; add the **--no-mmap** parameter to disable mmap() and avoid this issue.

5. Check the device ID in the output

   Like:
   ```
   Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
   ```

## Environment Variable

#### Build

|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, do not set it.|
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|CMAKE_CXX_COMPILER|icpx|Use icpx for SYCL code path|

#### Running

|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the device IDs in the default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable the log function by macro: GGML_SYCL_DEBUG|

## Known Issue

- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

  The oneAPI running environment is not enabled.

  Install the oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.

- Hang during startup

  llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, memcpy may be abnormal and block.

  Solution: add **--no-mmap**.

## Todo

- Support building on Windows.

- Support multiple cards.
ci/README.md
@@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt

 # with CUDA support
 GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with SYCL support
+source /opt/intel/oneapi/setvars.sh
+GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
11 ci/run.sh
@@ -10,6 +10,9 @@
 # # with CUDA support
 # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with SYCL support
+# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
 fi

+if [ ! -z ${GG_BUILD_SYCL} ]; then
+    if [ -z ${ONEAPI_ROOT} ]; then
+        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+        exit 1
+    fi
+
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
+fi
 ## helpers

 # download a file if it does not exist or if it is outdated
common/common.cpp
@@ -42,6 +42,10 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUBLAS_SYCL
+#endif
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
@@ -599,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             break;
         }
         params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
     } else if (arg == "--split-mode" || arg == "-sm") {
         if (++i >= argc) {
             invalid_param = true;
@@ -618,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             invalid_param = true;
             break;
         }
-#ifndef GGML_USE_CUBLAS
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
+
     } else if (arg == "--tensor-split" || arg == "-ts") {
         if (++i >= argc) {
             invalid_param = true;
@@ -643,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 params.tensor_split[i] = 0.0f;
             }
         }
-#ifndef GGML_USE_CUBLAS
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
     } else if (arg == "--no-mmap") {
         params.use_mmap = false;
     } else if (arg == "--numa") {
@@ -1007,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
     printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
     printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-#endif
+#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
@@ -1514,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
     fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
     fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
     fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
examples/CMakeLists.txt
@@ -23,6 +23,9 @@ else()
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
+    if (LLAMA_SYCL)
+        add_subdirectory(sycl)
+    endif()
     add_subdirectory(main)
     add_subdirectory(tokenize)
     add_subdirectory(parallel)
examples/llama-bench/llama-bench.cpp
@@ -562,6 +562,7 @@ struct test {
     static const int build_number;
     static const bool cuda;
     static const bool opencl;
+    static const bool vulkan;
     static const bool metal;
     static const bool gpu_blas;
     static const bool blas;
@@ -643,6 +644,9 @@ struct test {
         if (opencl) {
             return "OpenCL";
         }
+        if (vulkan) {
+            return "Vulkan";
+        }
         if (metal) {
             return "Metal";
         }
@@ -658,7 +662,7 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "opencl", "metal", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_threads", "type_k", "type_v",
@@ -682,7 +686,7 @@ struct test {
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
+        if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
             field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
             return BOOL;
         }
@@ -710,7 +714,7 @@ struct test {
         }
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
@@ -738,6 +742,7 @@ const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
 const bool test::cuda = !!ggml_cpu_has_cublas();
 const bool test::opencl = !!ggml_cpu_has_clblast();
+const bool test::vulkan = !!ggml_cpu_has_vulkan();
 const bool test::metal = !!ggml_cpu_has_metal();
 const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
 const bool test::blas = !!ggml_cpu_has_blas();
examples/server/server.cpp
@@ -2099,7 +2099,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             invalid_param = true;
             break;
         }
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
         std::string arg_next = argv[i];

         // split string by , and /
@@ -2125,7 +2125,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         else if (arg == "--no-mul-mat-q" || arg == "-nommq")
         {
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
             params.mul_mat_q = false;
 #else
             LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
@@ -2138,7 +2138,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             invalid_param = true;
             break;
         }
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
             params.main_gpu = std::stoi(argv[i]);
 #else
             LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
9 examples/sycl/CMakeLists.txt (new file)

# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT

set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
47 examples/sycl/README.md (new file)

# llama.cpp/example/sycl

This example program provides the tools for llama.cpp for SYCL on Intel GPU.

## Tool

|Tool Name| Function|Status|
|-|-|-|
|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|

### ls-sycl-device

List all SYCL devices with ID, compute capability, max work group size, etc.

1. Build llama.cpp for SYCL for all targets.

2. Enable the oneAPI running environment

```
source /opt/intel/oneapi/setvars.sh
```

3. Execute

```
./build/bin/ls-sycl-device
```

Check the IDs in the startup log, like:

```
found 4 SYCL devices:
  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```

|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended|
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
20 examples/sycl/build.sh (new executable file)

# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT

mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh

#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference

#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

#build example/main only
#cmake --build . --config Release --target main

#build all binary
cmake --build . --config Release -v
11 examples/sycl/ls-sycl-device.cpp (new file)

/*MIT license
  Copyright (C) 2024 Intel Corporation
  SPDX-License-Identifier: MIT
*/

#include "ggml-sycl.h"

int main(int argc, char ** argv) {
    ggml_backend_sycl_print_sycl_devices();
    return 0;
}
19 examples/sycl/run-llama2.sh (new executable file)

#!/bin/bash

# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT

INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh

if [ $# -gt 0 ]; then
    export GGML_SYCL_DEVICE=$1
else
    export GGML_SYCL_DEVICE=0
fi
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
#export GGML_SYCL_DEBUG=1
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1705677747,
-        "narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=",
+        "lastModified": 1706191920,
+        "narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261",
+        "rev": "ae5c332cbb5827f6b1f02572496b141021de335f",
         "type": "github"
       },
       "original": {
106 ggml-alloc.c
@@ -778,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
 }

 // utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
-        // failed to allocate buffer
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
-        return NULL;
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free(*buffers[i]);
+        }
+        free(buffers);
+        return false;
     }

     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
@@ -826,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     ggml_tallocr_free(tallocr);

+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            // tensor is too large to fit in a single buffer
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+        // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
     return buffer;
 }
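For orientation, here is a minimal caller-side sketch (not part of the diff; the context size and tensor shapes are invented for illustration). It shows that ggml_backend_alloc_ctx_tensors_from_buft keeps its old signature: callers still make a single call per context, and the splitting into several backend buffers only happens internally when the buffer type reports a maximum size.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Illustrative only: allocate the data of all tensors in a no_alloc context.
// If the total size exceeds the buffer type's max size, the function now
// returns a multi-buffer instead of failing outright.
static ggml_backend_buffer_t alloc_ctx_example(struct ggml_context ** out_ctx) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*ggml_tensor_overhead(), // metadata only
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                      // data comes from the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    for (int i = 0; i < 8; ++i) {
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096); // shapes are arbitrary
    }

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());

    *out_ctx = ctx; // caller keeps the context alive while the buffer is in use
    return buf;     // free with ggml_backend_buffer_free() when done
}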
ggml-backend-impl.h
@@ -19,6 +19,7 @@ extern "C" {
         const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
         size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
         size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
     //
     // Backend
     //
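As a hypothetical illustration of the new interface slot (the my_* names are invented, not from the diff): a backend whose driver caps single allocations implements the callback below and wires it into its ggml_backend_buffer_type_i table; leaving the slot NULL preserves the previous unlimited behaviour.

#include <stddef.h>
#include "ggml-backend-impl.h"

// Hypothetical callback for a backend that rejects allocations above 1 GiB.
static size_t my_buft_get_max_size(ggml_backend_buffer_type_t buft) {
    (void) buft; // the limit is global for this hypothetical backend
    return (size_t) 1024*1024*1024;
}
// In the backend's buffer type interface this would fill the new slot:
//     /* .get_max_size = */ my_buft_get_max_size,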
109
ggml-backend.c
109
ggml-backend.c
@ -27,6 +27,14 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
|||||||
return buft->iface.get_alignment(buft);
|
return buft->iface.get_alignment(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||||
|
// get_max_size is optional, defaults to SIZE_MAX
|
||||||
|
if (buft->iface.get_max_size) {
|
||||||
|
return buft->iface.get_max_size(buft);
|
||||||
|
}
|
||||||
|
return SIZE_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||||
if (buft->iface.get_alloc_size) {
|
if (buft->iface.get_alloc_size) {
|
||||||
@ -57,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
|||||||
size_t size) {
|
size_t size) {
|
||||||
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
||||||
|
|
||||||
GGML_ASSERT(iface.get_base != NULL);
|
|
||||||
|
|
||||||
(*buffer) = (struct ggml_backend_buffer) {
|
(*buffer) = (struct ggml_backend_buffer) {
|
||||||
/* .interface = */ iface,
|
/* .interface = */ iface,
|
||||||
/* .buft = */ buft,
|
/* .buft = */ buft,
|
||||||
@ -108,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
|
|||||||
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
||||||
|
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
||||||
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||||
}
|
}
|
||||||
@ -122,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
|||||||
|
|
||||||
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||||
buffer->usage = usage;
|
buffer->usage = usage;
|
||||||
|
|
||||||
|
// FIXME: add a generic callback to the buffer interface
|
||||||
|
if (ggml_backend_buffer_is_multi_buffer(buffer)) {
|
||||||
|
ggml_backend_multi_buffer_set_usage(buffer, usage);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
||||||
@ -171,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
|
|||||||
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
|
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
|
||||||
|
return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||||
@ -339,11 +358,21 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|||||||
ggml_backend_cuda_reg_devices();
|
ggml_backend_cuda_reg_devices();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_SYCL
|
||||||
|
extern void ggml_backend_sycl_reg_devices(void);
|
||||||
|
ggml_backend_sycl_reg_devices();
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
||||||
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
|
||||||
|
ggml_backend_vk_reg_devices();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -547,6 +576,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -602,6 +632,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -758,6 +789,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
     GGML_UNUSED(user_data);
 }
 
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+        /* .get_name    = */ ggml_backend_multi_buffer_get_name,
+        /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+        /* .get_base    = */ NULL,
+        /* .init_tensor = */ NULL,
+        /* .set_tensor  = */ NULL,
+        /* .get_tensor  = */ NULL,
+        /* .cpy_tensor  = */ NULL,
+        /* .clear       = */ ggml_backend_multi_buffer_clear,
+        /* .reset       = */ NULL,
+    };
+
+    return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+
+
 // scheduler
 
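As a rough usage sketch (not part of the patch; the helper name and the two buffer sizes below are arbitrary), the new multi-buffer wraps several backend buffers behind a single ggml_backend_buffer_t that reports their combined size and forwards clear, usage and free to its children:

#include "ggml-backend.h"

// hypothetical example: combine two CPU buffers into one multi-buffer
void example_multi_buffer(void) {
    ggml_backend_buffer_t parts[2];
    parts[0] = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 16u*1024*1024);
    parts[1] = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 32u*1024*1024);

    ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);

    // the size is the sum of the children; clear is forwarded to each of them
    ggml_backend_buffer_clear(multi, 0);

    // freeing the multi-buffer also frees parts[0] and parts[1]
    ggml_backend_buffer_free(multi);
}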
@@ -20,6 +20,7 @@ extern "C" {
     GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
@@ -36,6 +37,7 @@ extern "C" {
     GGML_API           size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
     GGML_API GGML_CALL void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API           size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
     GGML_API           size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API           void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API           bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
@@ -54,6 +56,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
     GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
     GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
 
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
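A small sketch of how a caller might use the new query (hypothetical helper, not part of the patch). Buffer types that do not implement get_max_size report SIZE_MAX, so everything fits unless a backend overrides it:

#include <stdbool.h>
#include <stddef.h>
#include "ggml-backend.h"

// hypothetical helper: does an allocation of this size fit in a single
// buffer of the given buffer type?
static bool fits_in_one_buffer(ggml_backend_buffer_type_t buft, size_t alloc_size) {
    return alloc_size <= ggml_backend_buft_get_max_size(buft);
}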
@@ -10440,6 +10440,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -10715,6 +10716,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
@@ -10794,6 +10796,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
     /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
     /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
ggml-metal.m (34 changed lines)

@@ -24,10 +24,7 @@
 
 #define UNUSED(x) (void)(x)
 
-#define GGML_METAL_MAX_KERNELS 256
-
 struct ggml_metal_kernel {
-    id<MTLFunction>             function;
     id<MTLComputePipelineState> pipeline;
 };
 
@@ -162,11 +159,10 @@ struct ggml_metal_context {
 
     id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
-    id<MTLLibrary>      library;
 
     dispatch_queue_t d_queue;
 
-    struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
+    struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
 
     bool support_simdgroup_reduction;
     bool support_simdgroup_mm;
@@ -249,6 +245,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     ctx->queue   = [ctx->device newCommandQueue];
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
+    id<MTLLibrary> metal_library;
+
     // load library
     {
         NSBundle * bundle = nil;
@@ -263,7 +261,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             // pre-compiled library found
             NSURL * libURL = [NSURL fileURLWithPath:libPath];
             GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
-            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+            metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
@@ -305,7 +303,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
             //[options setFastMathEnabled:false];
 
-            ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+            metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
@@ -370,8 +368,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     {
         NSError * error = nil;
 
-        for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
-            ctx->kernels[i].function = nil;
+        for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
             ctx->kernels[i].pipeline = nil;
         }
 
@@ -383,13 +380,15 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #define GGML_METAL_ADD_KERNEL(e, name, supported) \
         if (supported) { \
             struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
-            kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
+            id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
+            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
+            [metal_function release]; \
             GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
                     (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
                     (int) kernel->pipeline.threadExecutionWidth); \
             if (error) { \
                 GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+                [metal_library release]; \
                 return NULL; \
             } \
         } else { \
@@ -521,23 +520,17 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
     }
 
+    [metal_library release];
     return ctx;
 }
 
 static void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
-    for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
-        if (ctx->kernels[i].pipeline) {
-            [ctx->kernels[i].pipeline release];
-        }
-
-        if (ctx->kernels[i].function) {
-            [ctx->kernels[i].function release];
-        }
+    for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
+        [ctx->kernels[i].pipeline release];
     }
 
-    [ctx->library release];
     [ctx->queue release];
     [ctx->device release];
 
@@ -2504,6 +2497,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
         /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
         /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
+        /* .get_max_size     = */ NULL, // TODO: return device.maxBufferLength
        /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
         /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
@@ -2136,6 +2136,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_opencl_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // TODO: return from device info
     /* .get_alloc_size   = */ NULL,
     /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -2192,6 +2193,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
     /* .get_name         = */ ggml_backend_opencl_host_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
     /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
     /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
ggml-sycl.cpp (new file, 15199 lines; diff suppressed because it is too large)

ggml-sycl.h (new file, 27 lines)

@@ -0,0 +1,27 @@
+/*MIT license
+Copyright (C) 2024 Intel Corporation
+SPDX-License-Identifier: MIT
+*/
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_SYCL_MAX_DEVICES 16
+#define GGML_SYCL_NAME "SYCL"
+
+GGML_API void ggml_init_sycl(void);
+GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+
+#ifdef __cplusplus
+}
+#endif
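A minimal bring-up sketch against this header (not from the patch; device index 0 and the 1 MiB size are arbitrary, and teardown of the backend object is omitted). In a GGML_USE_SYCL build, ggml_init() already calls ggml_init_sycl(), so the explicit call below is only needed when driving the backend directly:

#include "ggml-sycl.h"

// hypothetical standalone example: initialize the SYCL backend on device 0
// and allocate a buffer from it
int main(void) {
    ggml_init_sycl();
    ggml_backend_sycl_print_sycl_devices();

    ggml_backend_t backend = ggml_backend_sycl_init(0);
    if (backend == NULL) {
        return 1;
    }

    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, 1024*1024);
    ggml_backend_buffer_free(buf);
    return 0;
}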
ggml-vulkan-shaders.hpp (new file, 61420 lines; diff suppressed because it is too large)

ggml-vulkan.cpp (new file, 5176 lines; diff suppressed because it is too large)
ggml-vulkan.h (new file, 34 lines)

@@ -0,0 +1,34 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_VK_NAME "Vulkan"
+
+GGML_API void ggml_vk_init(void);
+
+GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
+GGML_API void ggml_vk_preallocate_buffers(void);
+GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
+GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#endif
+GGML_API void ggml_vk_graph_cleanup(void);
+
+// backend API
+GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void);
+
+GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void);
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+
+#ifdef __cplusplus
+}
+#endif
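A minimal bring-up sketch against this header (not from the patch; error handling is reduced to an early return and backend teardown is omitted):

#include "ggml-vulkan.h"

// hypothetical standalone example: initialize the Vulkan backend and look up
// its device and pinned-host buffer types
int main(void) {
    ggml_backend_t backend = ggml_backend_vk_init();
    if (backend == NULL || !ggml_backend_is_vk(backend)) {
        return 1;
    }

    ggml_backend_buffer_type_t buft      = ggml_backend_vk_buffer_type();
    ggml_backend_buffer_type_t host_buft = ggml_backend_vk_host_buffer_type();
    (void) buft;
    (void) host_buft;
    return 0;
}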
ggml.c (65 changed lines)

@@ -248,6 +248,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -2355,6 +2359,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
+#elif defined(GGML_USE_VULKAN)
+        ggml_vk_init();
+#elif defined(GGML_USE_SYCL)
+        ggml_init_sycl();
 #endif
 
         ggml_setup_op_has_task_pass();
@@ -8130,7 +8138,7 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-#ifdef GGML_USE_CLBLAST
+#if defined(GGML_USE_CLBLAST)
     if (src1->backend == GGML_BACKEND_GPU) {
         // TODO: OpenCL kernel support full broadcast
         GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@@ -10085,7 +10093,7 @@ static void ggml_compute_forward_mul_mat(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         const int64_t ne_plane      = ne01*ne00;
-        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        const size_t  desired_wsize = ne13*ne12*ne_plane*sizeof(float);
         UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
@@ -15010,8 +15018,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
+#elif defined(GGML_USE_VULKAN)
+    const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+    if (skip_cpu) {
+        ggml_vk_check_results_1(params, tensor);
+    }
+#endif
+    if (skip_cpu) {
+        return;
+    }
+    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
+#ifdef GGML_USE_SYCL
+    bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
+    if (skip_cpu) {
+        return;
+    }
+#endif // GGML_USE_SYCL
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -17418,6 +17444,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
+#ifdef GGML_USE_VULKAN
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
+    }
+    ggml_vk_preallocate_buffers();
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+    }
+#endif
+
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -17469,6 +17506,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
+#ifdef GGML_USE_VULKAN
+    ggml_vk_graph_cleanup();
+#endif
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -20603,7 +20644,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
@@ -20626,8 +20667,24 @@ int ggml_cpu_has_clblast(void) {
 #endif
 }
 
+int ggml_cpu_has_vulkan(void) {
+#if defined(GGML_USE_VULKAN)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_sycl(void) {
+#if defined(GGML_USE_SYCL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
 }
 
 int ggml_cpu_has_sse3(void) {
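A tiny check of the new feature-query functions (illustrative only; which flags report 1 depends entirely on the build configuration):

#include <stdio.h>
#include "ggml.h"

// hypothetical example: report which GPU code paths this build was compiled with
int main(void) {
    printf("vulkan: %d, sycl: %d, gpublas: %d\n",
           ggml_cpu_has_vulkan(), ggml_cpu_has_sycl(), ggml_cpu_has_gpublas());
    return 0;
}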
ggml.h (2 changed lines)

@@ -2281,9 +2281,11 @@ extern "C" {
     GGML_API int ggml_cpu_has_blas        (void);
     GGML_API int ggml_cpu_has_cublas      (void);
     GGML_API int ggml_cpu_has_clblast     (void);
+    GGML_API int ggml_cpu_has_vulkan      (void);
     GGML_API int ggml_cpu_has_gpublas     (void);
     GGML_API int ggml_cpu_has_sse3        (void);
     GGML_API int ggml_cpu_has_ssse3       (void);
+    GGML_API int ggml_cpu_has_sycl        (void);
     GGML_API int ggml_cpu_has_vsx         (void);
 
     //

ggml_vk_generate_shaders.py (new file, 2362 lines; diff suppressed because it is too large)
llama.cpp (90 changed lines)

@@ -11,6 +11,10 @@
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -52,6 +56,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cfloat>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
@@ -1279,8 +1284,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
     }
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    if (host_buffer) {
+        buft = ggml_backend_vk_host_buffer_type();
+    }
 #endif
 
     if (buft == nullptr) {
@@ -1298,6 +1309,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type();
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
 #endif
@@ -6875,7 +6890,7 @@ static int llama_decode_internal(
     }
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
+    if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
         n_threads = 1;
     }
 
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sample_softmax(ctx, candidates);
|
|
||||||
|
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
float scale = candidates->data[0].p; // scale by max prob
|
bool min_p_applied = false;
|
||||||
size_t i = 1; // first token always matches
|
|
||||||
|
|
||||||
for (; i < candidates->size; ++i) {
|
// if the candidates aren't sorted, try the unsorted implementation first
|
||||||
if (candidates->data[i].p < p * scale && i >= min_keep) {
|
if (!candidates->sorted) {
|
||||||
break; // prob too small
|
std::vector<llama_token_data> filtered_tokens;
|
||||||
|
|
||||||
|
float max_logit = -FLT_MAX;
|
||||||
|
for (size_t i = 0; i < candidates->size; ++i) {
|
||||||
|
max_logit = std::max(max_logit, candidates->data[i].logit);
|
||||||
|
}
|
||||||
|
const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
|
||||||
|
|
||||||
|
for (size_t i = 0; i < candidates->size; ++i) {
|
||||||
|
if (candidates->data[i].logit >= min_logit) {
|
||||||
|
filtered_tokens.push_back(candidates->data[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we have enough values the operation was a success
|
||||||
|
if (filtered_tokens.size() >= min_keep) {
|
||||||
|
memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
|
||||||
|
candidates->size = filtered_tokens.size();
|
||||||
|
min_p_applied = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Resize the output vector to keep only the matching tokens
|
// if the candidates are sorted or the unsorted implementation failed, use this implementation
|
||||||
candidates->size = i;
|
if (!min_p_applied) {
|
||||||
|
// Sort the logits in descending order
|
||||||
|
if (!candidates->sorted) {
|
||||||
|
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||||
|
return a.logit > b.logit;
|
||||||
|
});
|
||||||
|
candidates->sorted = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
|
||||||
|
size_t i = 1; // first token always matches
|
||||||
|
|
||||||
|
for (; i < candidates->size; ++i) {
|
||||||
|
if (candidates->data[i].logit < min_logit && i >= min_keep) {
|
||||||
|
break; // prob too small
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resize the output vector to keep only the matching tokens
|
||||||
|
candidates->size = i;
|
||||||
|
}
|
||||||
|
|
||||||
if (ctx) {
|
if (ctx) {
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
@ -10225,6 +10275,26 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
if (model->n_gpu_layers > 0) {
|
||||||
|
ggml_backend_t backend = ggml_backend_vk_init();
|
||||||
|
if (backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
ctx->backends.push_back(backend);
|
||||||
|
}
|
||||||
|
#elif defined(GGML_USE_SYCL)
|
||||||
|
if (model->n_gpu_layers > 0) {
|
||||||
|
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
||||||
|
if (backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
ctx->backends.push_back(backend);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
|
llama.h (5 changed lines)

@@ -6,6 +6,9 @@
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
+#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
 #else
 #define LLAMA_MAX_DEVICES 1
 #endif // GGML_USE_CUBLAS
@@ -46,7 +49,7 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -1 +1 @@
-c2448f88d17395452a587d0176d19ed87e0f7ce1
+f2a9472b23cf27e672ed70a2a6eb078f7b060f18
@@ -239,10 +239,17 @@ static std::string var_to_str(ggml_type type) {
 #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
 #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
 
+#ifdef GGML_USE_SYCL
+static bool inline _isinf(float f) {
+    return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
+}
+#else
+static bool inline _isinf(float f) { return std::isinf(f); }
+#endif
+
 // accept FLT_MAX as infinity
 static bool isinf_or_max(float f) {
-    return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX;
+    return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
 }
 
 static bool ggml_is_view_op(enum ggml_op op) {
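The bit-pattern test added for the SYCL build checks the IEEE-754 encoding of infinity directly (exponent bits all ones, mantissa zero). The likely motivation, though the patch does not state it, is that the SYCL toolchain builds with fast math, where std::isinf can be folded to a constant. A small self-contained check of the same idea, written with memcpy to avoid the aliasing cast (sketch only):

#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

// same test as above, expressed without the pointer cast
static bool isinf_bits(float f) {
    uint32_t u;
    memcpy(&u, &f, sizeof u);
    return (u & 0x7fffffffu) == 0x7f800000u;
}

int main(void) {
    assert(isinf_bits(INFINITY) && isinf_bits(-INFINITY));
    assert(!isinf_bits(0.0f) && !isinf_bits(FLT_MAX));
    return 0;
}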