From 4ef1b017af7ff09a89c37c9b3dff00ec3a1d79ba Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 4 Oct 2024 17:35:58 +0300
Subject: [PATCH] llama : adapt to backend changes

ggml-ci
---
 ggml/src/ggml-backend.cpp |  4 ++++
 ggml/src/ggml-metal.m     |  9 +++++++-
 src/llama.cpp             | 48 ++++-----------------------------------
 3 files changed, 16 insertions(+), 45 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index bb18f5bfd..afe64b105 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -479,6 +479,10 @@ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t devic
 }
 
 ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
+    if (device->iface.get_host_buffer_type == NULL) {
+        return NULL;
+    }
+
     return device->iface.get_host_buffer_type(device);
 }
 
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 1ba502f78..ee4dbd246 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -3628,6 +3628,13 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
     UNUSED(dev);
 }
 
+static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    return false;
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(op);
+}
+
 static struct ggml_backend_device_i ggml_backend_metal_device_i = {
     /* .get_name             = */ ggml_backend_metal_device_get_name,
     /* .get_description      = */ ggml_backend_metal_device_get_description,
@@ -3640,7 +3647,7 @@ static struct ggml_backend_device_i ggml_backend_metal_device_i = {
     /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_from_ptr,
     /* .supports_op          = */ ggml_backend_metal_device_supports_op,
     /* .supports_buft        = */ ggml_backend_metal_device_supports_buft,
-    /* .offload_op           = */ NULL,
+    /* .offload_op           = */ ggml_backend_metal_device_offload_op,
     /* .event_new            = */ NULL,
     /* .event_free           = */ NULL,
     /* .event_synchronize    = */ NULL,
diff --git a/src/llama.cpp b/src/llama.cpp
index bf6fd9277..a4581c2dd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -26,10 +26,6 @@
 #  include "ggml-blas.h"
 #endif
 
-#ifdef GGML_USE_METAL
-#  include "ggml-metal.h"
-#endif
-
 // TODO: replace with ggml API call
 #define QK_K 256
 
@@ -3292,9 +3288,6 @@ struct llama_context {
     std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 
     std::vector<ggml_backend_t> backends;
-#ifdef GGML_USE_METAL
-    ggml_backend_t backend_metal = nullptr;
-#endif
 #ifdef GGML_USE_BLAS
     ggml_backend_t backend_blas = nullptr;
 #endif
@@ -3420,9 +3413,7 @@ static int llama_get_device_count(const llama_model & model) {
     count += (int) model.rpc_servers.size();
 #endif
 
-#if defined(GGML_USE_METAL)
-    count += 1;
-#elif defined(GGML_USE_SYCL)
+#if defined(GGML_USE_SYCL)
     count += ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     count += ggml_backend_vk_get_device_count();
@@ -3488,9 +3479,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     }
     device -= (int)model.devices.size();
 
-#if defined(GGML_USE_METAL)
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
     buft = ggml_backend_vk_buffer_type(device);
 #elif defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(device);
@@ -8937,25 +8926,6 @@ static bool llm_load_tensors(
                 bufs.emplace(idx, buf);
             }
         }
-#ifdef GGML_USE_METAL
-        else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
-            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
-                const size_t max_size = ggml_get_max_tensor_size(ctx);
-                void * addr = nullptr;
-                size_t first, last;
-                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
-                if (first >= last) {
-                    continue;
-                }
-                ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
-                if (buf == nullptr) {
-                    throw std::runtime_error("unable to allocate backend metal buffer");
-                }
-                model.bufs.push_back(buf);
-                bufs.emplace(idx, buf);
-            }
-        }
-#endif
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
@@ -19041,7 +19011,7 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
@@ -19344,17 +19314,7 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
-#if defined(GGML_USE_METAL)
-    if (model->n_gpu_layers > 0) {
-        ctx->backend_metal = ggml_backend_metal_init();
-        if (ctx->backend_metal == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(ctx->backend_metal);
-    }
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
     if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
         LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
         llama_free(ctx);
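
Note on the first hunk above: with this change, ggml_backend_dev_host_buffer_type() returns NULL for devices whose interface does not provide get_host_buffer_type, instead of calling through a NULL function pointer. A minimal caller-side sketch of how such a NULL result might be handled is shown below; the fallback to the plain CPU buffer type is an illustrative assumption, not something this patch prescribes.

// Sketch only (assumes ggml-backend.h as of this commit is on the include path).
#include <stddef.h>
#include "ggml-backend.h"

static ggml_backend_buffer_type_t pick_host_buft(ggml_backend_dev_t dev) {
    // may now return NULL if the device has no dedicated host (pinned) buffer type
    ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
    if (buft == NULL) {
        buft = ggml_backend_cpu_buffer_type(); // assumed fallback for illustration
    }
    return buft;
}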