From 6ff0e7a32e6f952d43a62c2eb396b65a1fb18b4f Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 2 Oct 2024 01:23:54 +0200 Subject: [PATCH] add device props/caps, fully support async upload for all compatible backends --- ggml/include/ggml-backend.h | 77 ++++++++++++++++--------- ggml/src/ggml-backend-impl.h | 31 ++++++++--- ggml/src/ggml-backend.cpp | 74 ++++++++++++++---------- ggml/src/ggml-cuda.cu | 24 +++++++- src/llama.cpp | 105 +++++++++++++++++++++++++++-------- tests/test-backend-ops.cpp | 3 - 6 files changed, 221 insertions(+), 93 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 46a9435be..b096aaed6 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -20,13 +20,13 @@ extern "C" { // Backend buffer type // - GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); - GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); // // Backend buffer @@ -53,6 +53,9 @@ extern "C" { GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + // tensor copy between different backends + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); + // // Backend (stream) // @@ -88,49 +91,70 @@ extern "C" { GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); - // tensor copy between different backends - GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); - // asynchronous copy // the copy is performed after all the currently queued operations in backend_src // backend_dst will wait for the copy to complete before performing other operations // automatic fallback to sync copy if async is not supported GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); - // events - GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_dev_t device); - GGML_API void ggml_backend_event_free (ggml_backend_event_t event); - GGML_API void ggml_backend_event_record (ggml_backend_event_t event, ggml_backend_t backend); + GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); + + // + // Events + // + + GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device); + GGML_API void ggml_backend_event_free(ggml_backend_event_t event); + GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend); GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); - GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); + GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); // // Backend device // - enum ggml_backend_device_type { + enum ggml_backend_dev_type { GGML_BACKEND_DEVICE_TYPE_CPU, GGML_BACKEND_DEVICE_TYPE_GPU, - // devices with full capabilities (excludes backends such as BLAS) + // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication) GGML_BACKEND_DEVICE_TYPE_CPU_FULL, GGML_BACKEND_DEVICE_TYPE_GPU_FULL }; + // functionality supported by the device + struct ggml_backend_dev_caps { + // asynchronous operations + bool async; + // pinned host buffer + bool host_buffer; + // event synchronization + bool events; + }; + + // all the device properties + struct ggml_backend_dev_props { + const char * name; + const char * description; + size_t memory_free; + size_t memory_total; + enum ggml_backend_dev_type type; + struct ggml_backend_dev_caps caps; + }; + GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device); GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total); - GGML_API enum ggml_backend_device_type ggml_backend_dev_type(ggml_backend_dev_t device); + GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device); + GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); - GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); - GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); - GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); - - GGML_API ggml_backend_event_t ggml_backend_dev_event_new(ggml_backend_dev_t device); + GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); + GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); + GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); // // Backend (reg) @@ -158,16 +182,16 @@ extern "C" { GGML_API size_t ggml_backend_dev_count(void); GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index); GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); - GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_device_type type); + GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type); // Set the log callback for all registered backends GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data); - // Direct Backend (stream) initialization + // Direct backend (stream) initialization // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) - GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_device_type type, const char * params); + GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params); // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL) GGML_API ggml_backend_t ggml_backend_init_best(void); @@ -276,7 +300,6 @@ extern "C" { GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor); - // // CPU backend // diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 964a9c529..1b631f994 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -9,10 +9,9 @@ extern "C" { #endif // - // Backend buffer + // Backend buffer type // - // buffer type struct ggml_backend_buffer_type_i { const char * (*get_name) (ggml_backend_buffer_type_t buft); // allocate a buffer of this type @@ -33,7 +32,10 @@ extern "C" { void * context; }; - // buffer + // + // Backend buffer + // + struct ggml_backend_buffer_i { const char * (*get_name) (ggml_backend_buffer_t buffer); // (optional) free the buffer @@ -143,15 +145,26 @@ extern "C" { }; // - // Backend registry v2 + // Backend device // + // Note: if additional properties are needed, we should add a struct with all of them + // the current functions to obtain the properties can remain, since they are more convenient for often used properties struct ggml_backend_device_i { - // device properties + // device name: short identifier for this device, such as "CPU" or "CUDA0" const char * (*get_name)(ggml_backend_dev_t dev); + + // device description: short informative description of the device, could be the model name const char * (*get_description)(ggml_backend_dev_t dev); + + // device memory in bytes void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total); - enum ggml_backend_device_type (*get_type)(ggml_backend_dev_t dev); + + // device type + enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev); + + // device properties + void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props); // get the backend (reg) associated with this device ggml_backend_reg_t (*get_backend_reg)(ggml_backend_dev_t dev); @@ -190,6 +203,10 @@ extern "C" { void * context; }; + // + // Backend (reg) + // + struct ggml_backend_reg_i { const char * (*get_name)(ggml_backend_reg_t reg); @@ -212,7 +229,7 @@ extern "C" { }; - // Internal API + // Internal backend registry API void ggml_backend_register(ggml_backend_reg_t reg); void ggml_backend_device_register(ggml_backend_dev_t device); // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 2ccd8dba7..cca6c41a3 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -331,6 +331,10 @@ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * return false; } +ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { + return backend->device; +} + // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -440,10 +444,14 @@ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * device->iface.get_memory(device, free, total); } -enum ggml_backend_device_type ggml_backend_dev_type(ggml_backend_dev_t device) { +enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) { return device->iface.get_type(device); } +void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) { + device->iface.get_props(device, props); +} + ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) { return device->iface.get_backend_reg(device); } @@ -476,13 +484,6 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te return device->iface.offload_op(device, op); } -ggml_backend_event_t ggml_backend_dev_event_new(ggml_backend_dev_t device) { - if (!device->iface.event_new) { - return NULL; - } - return device->iface.event_new(device); -} - // Backend (reg) const char * ggml_backend_reg_name(ggml_backend_reg_t reg) { @@ -603,7 +604,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { return NULL; } -ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_device_type type) { +ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev) == type) { @@ -629,7 +630,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) return ggml_backend_dev_init(dev, params); } -ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_device_type type, const char * params) { +ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); if (!dev) { return NULL; @@ -1028,60 +1029,72 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user //////////////////////// -static const char * ggml_backend_cpu_device_name(ggml_backend_dev_t device) { +static const char * ggml_backend_cpu_device_name(ggml_backend_dev_t dev) { return "CPU"; - GGML_UNUSED(device); + GGML_UNUSED(dev); } -static const char * ggml_backend_cpu_device_description(ggml_backend_dev_t device) { +static const char * ggml_backend_cpu_device_description(ggml_backend_dev_t dev) { // TODO return "CPU"; - GGML_UNUSED(device); + GGML_UNUSED(dev); } -static void ggml_backend_cpu_device_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { +static void ggml_backend_cpu_device_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { // TODO *free = 0; *total = 0; - GGML_UNUSED(device); + GGML_UNUSED(dev); } -static enum ggml_backend_device_type ggml_backend_cpu_device_type(ggml_backend_dev_t device) { +static enum ggml_backend_dev_type ggml_backend_cpu_device_type(ggml_backend_dev_t dev) { return GGML_BACKEND_DEVICE_TYPE_CPU_FULL; - GGML_UNUSED(device); + GGML_UNUSED(dev); } -static ggml_backend_reg_t ggml_backend_cpu_device_reg(ggml_backend_dev_t device) { +static void ggml_backend_cpu_device_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_cpu_device_name(dev); + props->description = ggml_backend_cpu_device_description(dev); + props->type = ggml_backend_cpu_device_type(dev); + ggml_backend_cpu_device_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* async */ false, + /* host_buffer */ false, + /* events */ false, + }; +} + +static ggml_backend_reg_t ggml_backend_cpu_device_reg(ggml_backend_dev_t dev) { return ggml_backend_cpu_reg(); - GGML_UNUSED(device); + GGML_UNUSED(dev); } -static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t device, const char * params) { +static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) { return ggml_backend_cpu_init(); - GGML_UNUSED(device); + GGML_UNUSED(dev); GGML_UNUSED(params); } -static ggml_backend_buffer_type_t ggml_backend_cpu_device_buffer_type(ggml_backend_dev_t device) { +static ggml_backend_buffer_type_t ggml_backend_cpu_device_buffer_type(ggml_backend_dev_t dev) { return ggml_backend_cpu_buffer_type(); - GGML_UNUSED(device); + GGML_UNUSED(dev); } -static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { return ggml_backend_cpu_buffer_from_ptr(ptr, size); - GGML_UNUSED(device); + GGML_UNUSED(dev); GGML_UNUSED(max_tensor_size); } -static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { +static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_CPY: return @@ -1101,13 +1114,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t device, const return true; } - GGML_UNUSED(device); + GGML_UNUSED(dev); } -static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { return ggml_backend_buft_is_host(buft); - GGML_UNUSED(device); + GGML_UNUSED(dev); } struct ggml_backend_device_i ggml_backend_cpu_device_i = { @@ -1115,6 +1128,7 @@ struct ggml_backend_device_i ggml_backend_cpu_device_i = { /* .get_description = */ ggml_backend_cpu_device_description, /* .get_memory = */ ggml_backend_cpu_device_memory, /* .get_type = */ ggml_backend_cpu_device_type, + /* .get_props = */ ggml_backend_cpu_device_props, /* .get_backend_reg = */ ggml_backend_cpu_device_reg, /* .init_backend = */ ggml_backend_cpu_device_init, /* .buffer_type = */ ggml_backend_cpu_device_buffer_type, diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index ed8e43323..415d59465 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2929,11 +2929,31 @@ static void ggml_backend_cuda_device_memory(ggml_backend_dev_t dev, size_t * fre CUDA_CHECK(cudaMemGetInfo(free, total)); } -static enum ggml_backend_device_type ggml_backend_cuda_device_type(ggml_backend_dev_t dev) { +static enum ggml_backend_dev_type ggml_backend_cuda_device_type(ggml_backend_dev_t dev) { GGML_UNUSED(dev); return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; } +static void ggml_backend_cuda_device_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_cuda_device_name(dev); + props->description = ggml_backend_cuda_device_description(dev); + props->type = ggml_backend_cuda_device_type(dev); + ggml_backend_cuda_device_memory(dev, &props->memory_free, &props->memory_total); + + bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; +#ifdef GGML_CUDA_NO_PEER_COPY + bool events = false; +#else + bool events = true; +#endif + + props->caps = { + /* async */ true, + /* host_buffer */ host_buffer, + /* events */ events, + }; +} + static ggml_backend_reg_t ggml_backend_cuda_device_reg(ggml_backend_dev_t dev) { GGML_UNUSED(dev); return ggml_backend_cuda_reg(); @@ -3206,7 +3226,6 @@ static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const gg GGML_UNUSED(dev); } - static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) { #ifdef GGML_CUDA_NO_PEER_COPY return nullptr; @@ -3242,6 +3261,7 @@ static ggml_backend_device_i ggml_backend_cuda_device_interface = { /* .get_description = */ ggml_backend_cuda_device_description, /* .get_memory = */ ggml_backend_cuda_device_memory, /* .get_type = */ ggml_backend_cuda_device_type, + /* .get_props = */ ggml_backend_cuda_device_props, /* .get_backend_reg = */ ggml_backend_cuda_device_reg, /* .init_backend = */ ggml_backend_cuda_device_init, /* .buffer_type = */ ggml_backend_cuda_device_buffer_type, diff --git a/src/llama.cpp b/src/llama.cpp index 406d0f5b3..a7ed46b22 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5035,7 +5035,7 @@ struct llama_model_loader { // Returns false if cancelled by progress_callback bool load_all_data( struct ggml_context * ctx, - llama_buf_map & bufs_mmap, + llama_buf_map & bufs, llama_mlocks * lmlocks, llama_progress_callback progress_callback, void * progress_callback_user_data) { @@ -5044,7 +5044,6 @@ struct llama_model_loader { std::vector> read_buf; std::vector>> validation_result; - // TODO: adapt to ggml-backend // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; @@ -5054,26 +5053,84 @@ struct llama_model_loader { std::vector events; std::vector host_ptrs; size_t buffer_idx = 0; // buffer to use for async loads - - // TODO: only do this if the backend supports all the required features: async, events, pinned memory - // it also must be avoided for split buffers and other buffers that require the entire tensor to be loaded at once - ggml_backend_t upload_backend = nullptr; - if (!use_mmap && !check_tensors) { - // When not using mmaped io use async uploads from pinned memory to GPU memory. - // First determine if the CUDA backend is active, and if so, determine the device ID. - ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr; - ggml_backend_dev_t dev = buf ? ggml_backend_buft_get_device(ggml_backend_buffer_get_type(buf)) : nullptr; - ggml_backend_buffer_type_t host_buft = dev ? ggml_backend_dev_host_buffer_type(dev) : nullptr; - upload_backend = host_buft ? ggml_backend_dev_init(dev, nullptr) : nullptr; - - // If the cuda is active create pinned memory buffers and events for synchronisation. - if (upload_backend) { - for (size_t idx = 0; idx < n_buffers; ++idx) { - host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(host_buft, buffer_size)); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); - events.emplace_back(ggml_backend_dev_event_new(dev)); - } + ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t { + if (use_mmap || check_tensors) { + return nullptr; } + // When not using mmaped io use async uploads from pinned memory to GPU memory. + // First determine if the backend supports the necessary features for async uploads. + auto * buf = bufs.count(0) ? bufs.at(0) : nullptr; + if (!buf) { + LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn); + return nullptr; + } + + auto * buft = ggml_backend_buffer_get_type(buf); + auto * dev = ggml_backend_buft_get_device(buft); + if (!dev) { + LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn, + ggml_backend_buft_name(buft)); + return nullptr; + } + + if (buft != ggml_backend_dev_buffer_type(dev)) { + LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn, + ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); + return nullptr; + } + + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { + LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (!host_buft) { + LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + // If the backend is supported, create pinned memory buffers and events for synchronisation. + for (size_t idx = 0; idx < n_buffers; ++idx) { + auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf) { + LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + host_buffers.emplace_back(buf); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); + + auto * event = ggml_backend_event_new(dev); + if (!event) { + LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + events.emplace_back(event); + } + + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (!backend) { + LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + return backend; + }(__func__); + + if (upload_backend) { + LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, + ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), + ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), + ggml_backend_name(upload_backend)); } for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { @@ -5094,8 +5151,8 @@ struct llama_model_loader { if (use_mmap) { const auto & mapping = mappings.at(weight->idx); ggml_backend_buffer_t buf_mmap = nullptr; - if (bufs_mmap.count(weight->idx)) { - buf_mmap = bufs_mmap.at(weight->idx); + if (bufs.count(weight->idx)) { + buf_mmap = bufs.at(weight->idx); } uint8_t * data = (uint8_t *) mapping->addr + weight->offs; @@ -5131,7 +5188,7 @@ struct llama_model_loader { })); } } else { - // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. + // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { file->seek(weight->offs, SEEK_SET); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e12ecf558..86a0b379b 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -672,14 +672,11 @@ struct test_case { } // run - ggml_backend_synchronize(backend); - int64_t total_time_us = 0; int total_runs = 0; do { int64_t start_time = ggml_time_us(); ggml_backend_graph_compute(backend, gf); - ggml_backend_synchronize(backend); int64_t end_time = ggml_time_us(); total_time_us += end_time - start_time;