Mirror of https://github.com/ggerganov/llama.cpp.git
Support multiple GPUs (split mode) on SYCL backend (#5806)
* Support multiple cards: split-mode - layer|row
* Remove warning
* Rebase with master, support two new OPs, close feature for -sm=row, fix for unit test
* Update news
* Fix merge error
* Update according to review comments
Parent: 9bf297a02b
Commit: 715641391d
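As context for the diff below, a minimal sketch of the split-mode options this commit enables for the SYCL backend. The flags (-sm/--split-mode, -mg/--main-gpu, ZES_ENABLE_SYSMAN, -ngl) are taken from the README and run-script changes in this commit; the model path, prompt, and token count are placeholders only:

# Sketch only: split the model by layer across all detected SYCL GPUs.
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 64 -e -ngl 33 -sm layer

# Sketch only: disable splitting and run on a single (main) GPU.
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 64 -e -ngl 33 -sm none -mg 0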
@@ -1,6 +1,7 @@
 # llama.cpp for SYCL

 - [Background](#background)
+- [News](#news)
 - [OS](#os)
 - [Intel GPU](#intel-gpu)
 - [Docker](#docker)
@@ -25,6 +26,21 @@ The llama.cpp for SYCL is used to support Intel GPUs.

 For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).

+## News
+
+- 2024.3
+  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
+  - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
+  - Support detecting all GPUs with level-zero and same top **Max compute units**.
+  - Support OPs
+    - hardsigmoid
+    - hardswish
+    - pool2d
+
+- 2024.1
+  - Create SYCL backend for Intel GPU.
+  - Support Windows build
+
 ## OS

 |OS|Status|Verified|
@@ -449,6 +465,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|

 ## Known Issue

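For illustration, a minimal sketch of setting the environment variables from the table above on a single run (not part of the commit; model path and prompt are placeholders):

# Sketch only: enable SYCL debug logging and SYSMAN free-memory queries (recommended with --split-mode layer).
GGML_SYCL_DEBUG=1 ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 64 -e -ngl 33 -sm layer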
@@ -458,6 +475,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device

 Solution: add **--no-mmap** or **--mmap 0**.

+- Split-mode: [row] is not supported
+
+  It's on developing.
+
 ## Q&A

 - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
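A minimal sketch of the --no-mmap workaround mentioned above (not part of the commit; model path and prompt are placeholders):

# Sketch only: disable memory mapping of the model file, as the known-issue section suggests.
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 64 -e -ngl 33 --no-mmap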
@@ -640,6 +640,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
+#ifdef GGML_USE_SYCL
+                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
+                exit(1);
+#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 invalid_param = true;
@@ -123,20 +123,15 @@ static std::string get_gpu_info() {
     }
 #endif
 #ifdef GGML_USE_SYCL
-    int device_list[GGML_SYCL_MAX_DEVICES];
-    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
-
-    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
-        if (device_list[i] >0 ){
+    int count = ggml_backend_sycl_get_device_count();
+    for (int i = 0; i < count; i++) {
         char buf[128];
         ggml_sycl_get_device_description(i, buf, sizeof(buf));
         id += buf;
+        if (i < count - 1) {
             id += "/";
         }
     }
-    if (id.length() >2 ) {
-        id.pop_back();
-    }
 #endif
     // TODO: other backends
     return id;
@@ -7,7 +7,7 @@

 #include "ggml-sycl.h"

-int main(int argc, char ** argv) {
+int main() {
     ggml_backend_sycl_print_sycl_devices();
     return 0;
 }
@@ -8,12 +8,19 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh

 if [ $# -gt 0 ]; then
-    export GGML_SYCL_DEVICE=$1
+    GGML_SYCL_DEVICE=$1
 else
-    export GGML_SYCL_DEVICE=0
+    GGML_SYCL_DEVICE=0
 fi
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+echo "use $GGML_SYCL_DEVICE as main GPU"
 #export GGML_SYCL_DEBUG=1
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+#use all GPUs with same max compute units
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
ggml-sycl.cpp (2129 changed lines): file diff suppressed because it is too large.
@@ -24,6 +24,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
+
 #ifdef __cplusplus
 }
 #endif
llama.cpp (39 changed lines)
@@ -104,6 +104,7 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 8


 //
 // logging
 //
@@ -1429,7 +1430,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
+    if (host_buffer) {
         buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1483,6 +1486,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif

+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1494,6 +1503,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1507,6 +1518,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -12075,13 +12091,31 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
+            } else {
+                // LLAMA_SPLIT_LAYER requires a backend for each GPU
+                int id_list[GGML_SYCL_MAX_DEVICES];
+                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                    int device_id = id_list[i];
+                    ggml_backend_t backend = ggml_backend_sycl_init(i);
+                    if (backend == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
+            }
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
@@ -12161,7 +12195,6 @@ struct llama_context * llama_new_context_with_model(
         ggml_set_name(ctx->inp_cls, "inp_cls");
-
         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));

         LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
                 ggml_backend_buffer_name(ctx->buf_input),
                 ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);