mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
[SYCL] fix error when set main gpu to non-zero (#5901)
* fix error when set main gpu to non-zero * fix delete condition
This commit is contained in:
parent
e04e04f8fa
commit
ceca1aef07
178
ggml-sycl.cpp
178
ggml-sycl.cpp
@ -3559,12 +3559,31 @@ class sycl_gpu_mgr {
|
|||||||
int work_group_size = 0;
|
int work_group_size = 0;
|
||||||
std::string gpus_list = "";
|
std::string gpus_list = "";
|
||||||
|
|
||||||
|
/*
|
||||||
|
Use all GPU with same top max compute units
|
||||||
|
*/
|
||||||
sycl_gpu_mgr() {
|
sycl_gpu_mgr() {
|
||||||
detect_sycl_gpu_list_with_max_cu();
|
detect_sycl_gpu_list_with_max_cu();
|
||||||
get_allow_gpus();
|
get_allow_gpus();
|
||||||
create_context_with_gpus();
|
create_context_with_gpus();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Use the assigned GPU as only one
|
||||||
|
*/
|
||||||
|
sycl_gpu_mgr(int main_gpu_id) {
|
||||||
|
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
|
||||||
|
dpct::device_info prop;
|
||||||
|
dpct::get_device_info(prop, device);
|
||||||
|
gpus.push_back(main_gpu_id);
|
||||||
|
devices.push_back(device);
|
||||||
|
work_group_size = prop.get_max_work_group_size();
|
||||||
|
max_compute_units = prop.get_max_compute_units();
|
||||||
|
|
||||||
|
get_allow_gpus();
|
||||||
|
create_context_with_gpus();
|
||||||
|
}
|
||||||
|
|
||||||
void create_context_with_gpus() {
|
void create_context_with_gpus() {
|
||||||
sycl::context ctx = sycl::context(devices);
|
sycl::context ctx = sycl::context(devices);
|
||||||
assert(gpus.size() > 0);
|
assert(gpus.size() > 0);
|
||||||
@ -3580,7 +3599,7 @@ class sycl_gpu_mgr {
|
|||||||
gpus_list += std::to_string(gpus[i]);
|
gpus_list += std::to_string(gpus[i]);
|
||||||
gpus_list += ",";
|
gpus_list += ",";
|
||||||
}
|
}
|
||||||
if (gpus_list.length() > 2) {
|
if (gpus_list.length() > 1) {
|
||||||
gpus_list.pop_back();
|
gpus_list.pop_back();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -3629,8 +3648,8 @@ class sycl_gpu_mgr {
|
|||||||
if (gpus[i] == id)
|
if (gpus[i] == id)
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
assert(false);
|
printf("miss to get device index by id=%d\n", id);
|
||||||
return -1;
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_next_index(int id) {
|
int get_next_index(int id) {
|
||||||
@ -3639,8 +3658,7 @@ class sycl_gpu_mgr {
|
|||||||
if (gpus[i] == id)
|
if (gpus[i] == id)
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
assert(false);
|
GGML_ASSERT(false);
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -3649,6 +3667,7 @@ static int g_device_count = -1;
|
|||||||
static int g_all_sycl_device_count = -1;
|
static int g_all_sycl_device_count = -1;
|
||||||
static int g_main_device = -1;
|
static int g_main_device = -1;
|
||||||
static int g_main_device_id = -1;
|
static int g_main_device_id = -1;
|
||||||
|
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||||
|
|
||||||
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
|
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
|
||||||
|
|
||||||
@ -13225,7 +13244,7 @@ void ggml_backend_sycl_print_sycl_devices() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void print_gpu_device_list() {
|
void print_gpu_device_list() {
|
||||||
fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
|
fprintf(stderr, "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n",
|
||||||
g_sycl_gpu_mgr->get_gpu_count(),
|
g_sycl_gpu_mgr->get_gpu_count(),
|
||||||
g_sycl_gpu_mgr->gpus_list.c_str(),
|
g_sycl_gpu_mgr->gpus_list.c_str(),
|
||||||
g_sycl_gpu_mgr->max_compute_units);
|
g_sycl_gpu_mgr->max_compute_units);
|
||||||
@ -13264,23 +13283,6 @@ void ggml_init_sycl() try {
|
|||||||
#else
|
#else
|
||||||
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
|
||||||
dpct::dev_mgr::instance().device_count()) != 0) {
|
|
||||||
initialized = true;
|
|
||||||
g_sycl_loaded = false;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
|
||||||
ggml_backend_sycl_print_sycl_devices();
|
|
||||||
|
|
||||||
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
|
||||||
|
|
||||||
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
|
|
||||||
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
|
||||||
|
|
||||||
print_gpu_device_list();
|
|
||||||
|
|
||||||
int64_t total_vram = 0;
|
|
||||||
|
|
||||||
/* NOT REMOVE, keep it for next optimize for XMX.
|
/* NOT REMOVE, keep it for next optimize for XMX.
|
||||||
#if defined(SYCL_USE_XMX)
|
#if defined(SYCL_USE_XMX)
|
||||||
@ -13289,52 +13291,71 @@ void ggml_init_sycl() try {
|
|||||||
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
*/
|
*/
|
||||||
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
|
||||||
g_device_caps[id].vmm = 0;
|
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
||||||
g_device_caps[id].device_id = -1;
|
dpct::dev_mgr::instance().device_count()) != 0) {
|
||||||
g_device_caps[id].cc = 0;
|
initialized = true;
|
||||||
g_tensor_split[id] = 0;
|
g_sycl_loaded = false;
|
||||||
g_default_tensor_split[id] = 0;
|
return;
|
||||||
}
|
}
|
||||||
|
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
||||||
for (int i = 0; i < g_device_count; ++i) {
|
ggml_backend_sycl_print_sycl_devices();
|
||||||
int device_id = g_sycl_gpu_mgr->gpus[i];
|
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
||||||
g_device_caps[i].vmm = 0;
|
print_gpu_device_list();
|
||||||
|
|
||||||
dpct::device_info prop;
|
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
|
||||||
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
|
||||||
|
|
||||||
g_default_tensor_split[i] = total_vram;
|
|
||||||
total_vram += prop.get_global_mem_size();
|
|
||||||
|
|
||||||
g_device_caps[i].cc =
|
|
||||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < g_device_count; ++i) {
|
|
||||||
g_default_tensor_split[i] /= total_vram;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < g_device_count; ++i) {
|
|
||||||
SYCL_CHECK(ggml_sycl_set_device(i));
|
|
||||||
|
|
||||||
// create sycl streams
|
|
||||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
||||||
g_syclStreams[i][is] =
|
|
||||||
dpct::get_current_device().create_queue(
|
|
||||||
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
|
||||||
}
|
|
||||||
|
|
||||||
const dpct::queue_ptr stream = g_syclStreams[i][0];
|
|
||||||
// create sycl handle
|
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
|
||||||
}
|
|
||||||
|
|
||||||
initialized = true;
|
initialized = true;
|
||||||
g_sycl_loaded = true;
|
g_sycl_loaded = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
|
||||||
|
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
||||||
|
|
||||||
|
int64_t total_vram = 0;
|
||||||
|
|
||||||
|
|
||||||
|
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
||||||
|
g_device_caps[id].vmm = 0;
|
||||||
|
g_device_caps[id].device_id = -1;
|
||||||
|
g_device_caps[id].cc = 0;
|
||||||
|
g_tensor_split[id] = 0;
|
||||||
|
g_default_tensor_split[id] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < g_device_count; ++i) {
|
||||||
|
int device_id = g_sycl_gpu_mgr->gpus[i];
|
||||||
|
g_device_caps[i].vmm = 0;
|
||||||
|
|
||||||
|
dpct::device_info prop;
|
||||||
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||||
|
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
||||||
|
|
||||||
|
g_default_tensor_split[i] = total_vram;
|
||||||
|
total_vram += prop.get_global_mem_size();
|
||||||
|
|
||||||
|
g_device_caps[i].cc =
|
||||||
|
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < g_device_count; ++i) {
|
||||||
|
g_default_tensor_split[i] /= total_vram;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < g_device_count; ++i) {
|
||||||
|
SYCL_CHECK(ggml_sycl_set_device(i));
|
||||||
|
|
||||||
|
// create sycl streams
|
||||||
|
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||||
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||||
|
g_syclStreams[i][is] =
|
||||||
|
dpct::get_current_device().create_queue(
|
||||||
|
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
||||||
|
}
|
||||||
|
|
||||||
|
const dpct::queue_ptr stream = g_syclStreams[i][0];
|
||||||
|
// create sycl handle
|
||||||
|
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||||
@ -16732,22 +16753,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
|||||||
/* .is_host = */ nullptr,
|
/* .is_host = */ nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
|
||||||
|
if (device_index>=g_device_count or device_index<0) {
|
||||||
|
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||||
|
device_index, g_device_count-1);
|
||||||
|
GGML_ASSERT(device_index<g_device_count);
|
||||||
|
}
|
||||||
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
|
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
|
||||||
|
|
||||||
static bool ggml_backend_sycl_buffer_type_initialized = false;
|
if (!g_ggml_backend_sycl_buffer_type_initialized) {
|
||||||
|
|
||||||
if (!ggml_backend_sycl_buffer_type_initialized) {
|
|
||||||
for (int i = 0; i < g_device_count; i++) {
|
for (int i = 0; i < g_device_count; i++) {
|
||||||
ggml_backend_sycl_buffer_types[i] = {
|
ggml_backend_sycl_buffer_types[i] = {
|
||||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
|
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
ggml_backend_sycl_buffer_type_initialized = true;
|
g_ggml_backend_sycl_buffer_type_initialized = true;
|
||||||
}
|
}
|
||||||
|
return &ggml_backend_sycl_buffer_types[device_index];
|
||||||
return &ggml_backend_sycl_buffer_types[device];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sycl split buffer type
|
// sycl split buffer type
|
||||||
@ -17496,6 +17519,17 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
|
|||||||
return g_sycl_gpu_mgr->get_index(device_id);
|
return g_sycl_gpu_mgr->get_index(device_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
|
||||||
|
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
|
||||||
|
printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
|
||||||
|
if (g_sycl_gpu_mgr) {
|
||||||
|
delete g_sycl_gpu_mgr;
|
||||||
|
}
|
||||||
|
g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
|
||||||
|
ggml_init_sycl();
|
||||||
|
g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||||
|
}
|
||||||
|
|
||||||
extern "C" int ggml_backend_sycl_reg_devices();
|
extern "C" int ggml_backend_sycl_reg_devices();
|
||||||
|
|
||||||
int ggml_backend_sycl_reg_devices() {
|
int ggml_backend_sycl_reg_devices() {
|
||||||
|
@ -28,6 +28,7 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
|
|||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||||
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
|
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
|
||||||
|
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
16
llama.cpp
16
llama.cpp
@ -3750,6 +3750,14 @@ static bool llm_load_tensors(
|
|||||||
model.main_gpu = main_gpu;
|
model.main_gpu = main_gpu;
|
||||||
model.n_gpu_layers = n_gpu_layers;
|
model.n_gpu_layers = n_gpu_layers;
|
||||||
|
|
||||||
|
#ifdef GGML_USE_SYCL
|
||||||
|
if (split_mode == LLAMA_SPLIT_MODE_NONE) {
|
||||||
|
ggml_backend_sycl_set_single_device(main_gpu);
|
||||||
|
//SYCL use device index (0, 1, 2), instead if device id.
|
||||||
|
main_gpu = ggml_backend_sycl_get_device_index(main_gpu);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
const int64_t n_layer = hparams.n_layer;
|
const int64_t n_layer = hparams.n_layer;
|
||||||
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
||||||
|
|
||||||
@ -12260,13 +12268,13 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
ctx->backends.push_back(backend);
|
ctx->backends.push_back(backend);
|
||||||
} else {
|
} else {
|
||||||
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
||||||
int id_list[GGML_SYCL_MAX_DEVICES];
|
|
||||||
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
|
||||||
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
||||||
int device_id = id_list[i];
|
|
||||||
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
|
int id_list[GGML_SYCL_MAX_DEVICES];
|
||||||
|
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, id_list[i], i);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user