fix set main gpu error (#6073)

This commit is contained in:
Neo Zhang Jianyu 2024-03-15 18:53:53 +08:00 committed by GitHub
parent 131b058409
commit 46acb36767
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 282 additions and 99 deletions

View File

@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh
#for FP32 #for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only #build example/main
#cmake --build . --config Release --target main #cmake --build . --config Release --target main
#build example/llama-bench
#cmake --build . --config Release --target llama-bench
#build all binary #build all binary
cmake --build . --config Release -v cmake --build . --config Release -v

View File

@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1 GGML_SYCL_DEVICE=$1
GGML_SYCL_SINGLE_GPU=1
else else
GGML_SYCL_DEVICE=0 GGML_SYCL_DEVICE=0
fi fi
echo "use $GGML_SYCL_DEVICE as main GPU"
#export GGML_SYCL_DEBUG=1 #export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
#use all GPUs with same max compute units if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 echo "use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
fi
#use main GPU only #use main GPU only
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none #ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#use multiple GPUs with same max compute units
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0

View File

@ -16,6 +16,7 @@
#include <cinttypes> #include <cinttypes>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <cstdlib>
#include <float.h> #include <float.h>
#include <limits> #include <limits>
#include <stdint.h> #include <stdint.h>
@ -24,10 +25,9 @@
#include <cmath> #include <cmath>
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <regex>
#include <sycl/sycl.hpp> #include <sycl/sycl.hpp>
#include <sycl/half_type.hpp> #include <sycl/half_type.hpp>
@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
#define __dpct_noinline__ __attribute__((noinline)) #define __dpct_noinline__ __attribute__((noinline))
#endif #endif
std::string get_device_type_name(const sycl::device &Device) {
auto DeviceType = Device.get_info<sycl::info::device::device_type>();
switch (DeviceType) {
case sycl::info::device_type::cpu:
return "cpu";
case sycl::info::device_type::gpu:
return "gpu";
case sycl::info::device_type::host:
return "host";
case sycl::info::device_type::accelerator:
return "acc";
default:
return "unknown";
}
}
std::string get_device_backend_and_type(const sycl::device &device) {
std::stringstream device_type;
sycl::backend backend = device.get_backend();
device_type << backend << ":" << get_device_type_name(device);
return device_type.str();
}
namespace dpct namespace dpct
{ {
typedef sycl::queue *queue_ptr; typedef sycl::queue *queue_ptr;
@ -942,17 +966,65 @@ namespace dpct
private: private:
mutable std::recursive_mutex m_mutex; mutable std::recursive_mutex m_mutex;
static bool compare_dev(sycl::device &device1, sycl::device &device2)
{
dpct::device_info prop1;
dpct::get_device_info(prop1, device1);
dpct::device_info prop2;
dpct::get_device_info(prop2, device2);
return prop1.get_max_compute_units() > prop2.get_max_compute_units();
}
static int convert_backend_index(std::string & backend) {
if (backend == "ext_oneapi_level_zero:gpu") return 0;
if (backend == "opencl:gpu") return 1;
if (backend == "opencl:cpu") return 2;
if (backend == "opencl:acc") return 3;
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
GGML_ASSERT(false);
}
static bool compare_backend(std::string &backend1, std::string &backend2) {
return convert_backend_index(backend1) < convert_backend_index(backend2);
}
dev_mgr() dev_mgr()
{ {
sycl::device default_device = sycl::device default_device =
sycl::device(sycl::default_selector_v); sycl::device(sycl::default_selector_v);
_devs.push_back(std::make_shared<device_ext>(default_device)); _devs.push_back(std::make_shared<device_ext>(default_device));
std::vector<sycl::device> sycl_all_devs = std::vector<sycl::device> sycl_all_devs;
sycl::device::get_devices(sycl::info::device_type::all);
// Collect other devices except for the default device. // Collect other devices except for the default device.
if (default_device.is_cpu()) if (default_device.is_cpu())
_cpu_device = 0; _cpu_device = 0;
auto Platforms = sycl::platform::get_platforms();
// Keep track of the number of devices per backend
std::map<sycl::backend, size_t> DeviceNums;
std::map<std::string, std::vector<sycl::device>> backend_devices;
while (!Platforms.empty()) {
auto Platform = Platforms.back();
Platforms.pop_back();
auto devices = Platform.get_devices();
std::string backend_type = get_device_backend_and_type(devices[0]);
for (const auto &device : devices) {
backend_devices[backend_type].push_back(device);
}
}
std::vector<std::string> keys;
for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
keys.push_back(it->first);
}
std::sort(keys.begin(), keys.end(), compare_backend);
for (auto &key : keys) {
std::vector<sycl::device> devs = backend_devices[key];
std::sort(devs.begin(), devs.end(), compare_dev);
for (const auto &dev : devs) {
sycl_all_devs.push_back(dev);
}
}
for (auto &dev : sycl_all_devs) for (auto &dev : sycl_all_devs)
{ {
if (dev == default_device) if (dev == default_device)
@ -3202,6 +3274,11 @@ static int g_work_group_size = 0;
#define GGML_SYCL_MMV_Y 1 #define GGML_SYCL_MMV_Y 1
#endif #endif
enum ggml_sycl_backend_gpu_mode {
SYCL_UNSET_GPU_MODE = -1,
SYCL_SINGLE_GPU_MODE = 0,
SYCL_MUL_GPU_MODE
};
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
@ -3401,12 +3478,31 @@ class sycl_gpu_mgr {
int work_group_size = 0; int work_group_size = 0;
std::string gpus_list = ""; std::string gpus_list = "";
/*
Use all GPUs with same top max compute units
*/
sycl_gpu_mgr() { sycl_gpu_mgr() {
detect_sycl_gpu_list_with_max_cu(); detect_sycl_gpu_list_with_max_cu();
get_allow_gpus(); get_allow_gpus();
create_context_with_gpus(); create_context_with_gpus();
} }
/*
Only use the assigned GPU
*/
sycl_gpu_mgr(int main_gpu_id) {
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
dpct::device_info prop;
dpct::get_device_info(prop, device);
gpus.push_back(main_gpu_id);
devices.push_back(device);
work_group_size = prop.get_max_work_group_size();
max_compute_units = prop.get_max_compute_units();
get_allow_gpus();
create_context_with_gpus();
}
void create_context_with_gpus() { void create_context_with_gpus() {
sycl::context ctx = sycl::context(devices); sycl::context ctx = sycl::context(devices);
assert(gpus.size() > 0); assert(gpus.size() > 0);
@ -3422,7 +3518,7 @@ class sycl_gpu_mgr {
gpus_list += std::to_string(gpus[i]); gpus_list += std::to_string(gpus[i]);
gpus_list += ","; gpus_list += ",";
} }
if (gpus_list.length() > 2) { if (gpus_list.length() > 1) {
gpus_list.pop_back(); gpus_list.pop_back();
} }
} }
@ -3471,8 +3567,8 @@ class sycl_gpu_mgr {
if (gpus[i] == id) if (gpus[i] == id)
return i; return i;
} }
assert(false); printf("miss to get device index by id=%d\n", id);
return -1; GGML_ASSERT(false);
} }
int get_next_index(int id) { int get_next_index(int id) {
@ -3481,8 +3577,7 @@ class sycl_gpu_mgr {
if (gpus[i] == id) if (gpus[i] == id)
return i; return i;
} }
assert(false); GGML_ASSERT(false);
return -1;
} }
bool is_ext_oneapi_device(const sycl::device &dev) { bool is_ext_oneapi_device(const sycl::device &dev) {
@ -3500,11 +3595,14 @@ static int g_device_count = -1;
static int g_all_sycl_device_count = -1; static int g_all_sycl_device_count = -1;
static int g_main_device = -1; static int g_main_device = -1;
static int g_main_device_id = -1; static int g_main_device_id = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {}; static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0}; static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
struct sycl_device_capabilities { struct sycl_device_capabilities {
int cc; // compute capability int cc; // compute capability
bool vmm; // virtual memory support bool vmm; // virtual memory support
@ -13008,17 +13106,20 @@ bool ggml_sycl_loaded(void) {
return g_sycl_loaded; return g_sycl_loaded;
} }
void print_device_detail(int id) { void print_device_detail(int id, sycl::device &device, std::string device_type) {
dpct::device_info prop; dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id)))); dpct::get_device_info(prop, device)));
sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
std::string version; std::string version;
version += std::to_string(prop.get_major_version()); version += std::to_string(prop.get_major_version());
version += "."; version += ".";
version += std::to_string(prop.get_minor_version()); version += std::to_string(prop.get_minor_version());
fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id, device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
prop.get_name(), version.c_str(), prop.get_max_compute_units(), prop.get_name(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
prop.get_global_mem_size()); prop.get_global_mem_size());
@ -13026,19 +13127,35 @@ void print_device_detail(int id) {
void ggml_backend_sycl_print_sycl_devices() { void ggml_backend_sycl_print_sycl_devices() {
int device_count = dpct::dev_mgr::instance().device_count(); int device_count = dpct::dev_mgr::instance().device_count();
std::map<std::string, size_t> DeviceNums;
fprintf(stderr, "found %d SYCL devices:\n", device_count); fprintf(stderr, "found %d SYCL devices:\n", device_count);
fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n"); fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n"); fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
for (int id = 0; id < device_count; ++id) { for (int id = 0; id < device_count; ++id) {
print_device_detail(id); sycl::device device = dpct::dev_mgr::instance().get_device(id);
sycl::backend backend = device.get_backend();
std::string backend_type = get_device_backend_and_type(device);
int type_id=DeviceNums[backend_type]++;
std::stringstream device_type;
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
print_device_detail(id, device, device_type.str());
} }
} }
void print_gpu_device_list() { void print_gpu_device_list() {
fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n", GGML_ASSERT(g_sycl_gpu_mgr);
g_sycl_gpu_mgr->get_gpu_count(),
g_sycl_gpu_mgr->gpus_list.c_str(), char* hint=NULL;
g_sycl_gpu_mgr->max_compute_units); if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
} else {
hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
}
fprintf(stderr, hint,
g_sycl_gpu_mgr->get_gpu_count(),
g_sycl_gpu_mgr->gpus_list.c_str(),
g_sycl_gpu_mgr->max_compute_units);
} }
int get_sycl_env(const char *env_name, int default_val) { int get_sycl_env(const char *env_name, int default_val) {
@ -13074,23 +13191,6 @@ void ggml_init_sycl() try {
#else #else
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
#endif #endif
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
g_sycl_loaded = false;
return;
}
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
ggml_backend_sycl_print_sycl_devices();
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
print_gpu_device_list();
int64_t total_vram = 0;
/* NOT REMOVE, keep it for next optimize for XMX. /* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX) #if defined(SYCL_USE_XMX)
@ -13099,49 +13199,15 @@ void ggml_init_sycl() try {
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif #endif
*/ */
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
g_device_caps[id].vmm = 0; if (CHECK_TRY_ERROR(g_all_sycl_device_count =
g_device_caps[id].device_id = -1; dpct::dev_mgr::instance().device_count()) != 0) {
g_device_caps[id].cc = 0; initialized = true;
g_tensor_split[id] = 0; g_sycl_loaded = false;
g_default_tensor_split[id] = 0; return;
} }
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
for (int i = 0; i < g_device_count; ++i) { ggml_backend_sycl_print_sycl_devices();
int device_id = g_sycl_gpu_mgr->gpus[i];
g_device_caps[i].vmm = 0;
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(device_id))));
g_default_tensor_split[i] = total_vram;
total_vram += prop.get_global_mem_size();
g_device_caps[i].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version();
}
for (int i = 0; i < g_device_count; ++i) {
g_default_tensor_split[i] /= total_vram;
}
for (int i = 0; i < g_device_count; ++i) {
SYCL_CHECK(ggml_sycl_set_device(i));
// create sycl streams
for (int is = 0; is < MAX_STREAMS; ++is) {
SYCL_CHECK(CHECK_TRY_ERROR(
g_syclStreams[i][is] =
dpct::get_current_device().create_queue(
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
}
const dpct::queue_ptr stream = g_syclStreams[i][0];
// create sycl handle
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
}
initialized = true; initialized = true;
g_sycl_loaded = true; g_sycl_loaded = true;
} }
@ -13152,6 +13218,63 @@ catch (sycl::exception const &exc) {
std::exit(1); std::exit(1);
} }
void ggml_init_by_gpus(int device_count) try {
g_device_count = device_count;
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
int64_t total_vram = 0;
print_gpu_device_list();
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
g_device_caps[id].vmm = 0;
g_device_caps[id].device_id = -1;
g_device_caps[id].cc = 0;
g_tensor_split[id] = 0;
g_default_tensor_split[id] = 0;
}
for (int i = 0; i < g_device_count; ++i) {
int device_id = g_sycl_gpu_mgr->gpus[i];
g_device_caps[i].vmm = 0;
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(device_id))));
g_default_tensor_split[i] = total_vram;
total_vram += prop.get_global_mem_size();
g_device_caps[i].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version();
}
for (int i = 0; i < g_device_count; ++i) {
g_default_tensor_split[i] /= total_vram;
}
for (int i = 0; i < g_device_count; ++i) {
SYCL_CHECK(ggml_sycl_set_device(i));
// create sycl streams
for (int is = 0; is < MAX_STREAMS; ++is) {
SYCL_CHECK(CHECK_TRY_ERROR(
g_syclStreams[i][is] =
dpct::get_current_device().create_queue(
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
}
const dpct::queue_ptr stream = g_syclStreams[i][0];
// create sycl handle
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
}
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}
void *ggml_sycl_host_malloc(size_t size) try { void *ggml_sycl_host_malloc(size_t size) try {
if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
return nullptr; return nullptr;
@ -16551,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
/* .is_host = */ nullptr, /* .is_host = */ nullptr,
}; };
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
if (device_index>=g_device_count or device_index<0) {
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
device_index, g_device_count-1);
GGML_ASSERT(device_index<g_device_count);
}
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES]; static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
static bool ggml_backend_sycl_buffer_type_initialized = false; if (!g_ggml_backend_sycl_buffer_type_initialized) {
if (!ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < g_device_count; i++) { for (int i = 0; i < g_device_count; i++) {
ggml_backend_sycl_buffer_types[i] = { ggml_backend_sycl_buffer_types[i] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface, /* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])}, /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
}; };
} }
ggml_backend_sycl_buffer_type_initialized = true; g_ggml_backend_sycl_buffer_type_initialized = true;
} }
return &ggml_backend_sycl_buffer_types[device_index];
return &ggml_backend_sycl_buffer_types[device];
} }
// sycl split buffer type // sycl split buffer type
@ -17319,11 +17444,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
return g_sycl_gpu_mgr->get_index(device_id); return g_sycl_gpu_mgr->get_index(device_id);
} }
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
return g_sycl_gpu_mgr->gpus[device_index];
}
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
if (g_sycl_gpu_mgr) {
delete g_sycl_gpu_mgr;
}
g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
g_ggml_backend_sycl_buffer_type_initialized = false;
}
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
return;
}
fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
if (g_sycl_gpu_mgr) {
delete g_sycl_gpu_mgr;
}
g_sycl_gpu_mgr = new sycl_gpu_mgr();
g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
g_ggml_backend_sycl_buffer_type_initialized = false;
}
extern "C" int ggml_backend_sycl_reg_devices(); extern "C" int ggml_backend_sycl_reg_devices();
int ggml_backend_sycl_reg_devices() { int ggml_backend_sycl_reg_devices() {
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); ggml_backend_sycl_set_mul_device_mode();
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
assert(g_device_count>0); assert(g_device_count>0);
for (int i = 0; i < g_device_count; i++) { for (int i = 0; i < g_device_count; i++) {
int id = g_sycl_gpu_mgr->gpus[i]; int id = g_sycl_gpu_mgr->gpus[i];

View File

@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -5064,6 +5064,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
} }
#endif #endif
#ifdef GGML_USE_SYCL
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
ggml_backend_sycl_set_single_device_mode(params.main_gpu);
//SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
} else {
ggml_backend_sycl_set_mul_device_mode();
}
#endif
if (!llm_load_tensors( if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data params.progress_callback, params.progress_callback_user_data
@ -12921,23 +12931,22 @@ struct llama_context * llama_new_context_with_model(
if (model->n_gpu_layers > 0) { if (model->n_gpu_layers > 0) {
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu); ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
if (backend == nullptr) { if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index); int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
llama_free(ctx); llama_free(ctx);
return nullptr; return nullptr;
} }
ctx->backends.push_back(backend); ctx->backends.push_back(backend);
} else { } else {
// LLAMA_SPLIT_LAYER requires a backend for each GPU // LLAMA_SPLIT_LAYER requires a backend for each GPU
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
int device_id = id_list[i];
ggml_backend_t backend = ggml_backend_sycl_init(i); ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) { if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i); int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
llama_free(ctx); llama_free(ctx);
return nullptr; return nullptr;
} }