add device props/caps, fully support async upload for all compatible backends

This commit is contained in:
slaren 2024-10-02 01:23:54 +02:00
parent 805fea97ac
commit 6ff0e7a32e
6 changed files with 221 additions and 93 deletions

View File

@ -53,6 +53,9 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// //
// Backend (stream) // Backend (stream)
// //
@ -88,16 +91,18 @@ extern "C" {
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy // asynchronous copy
// the copy is performed after all the currently queued operations in backend_src // the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations // backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported // automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
//
// Events
//
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device); GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event); GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend); GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
@ -108,18 +113,39 @@ extern "C" {
// Backend device // Backend device
// //
enum ggml_backend_device_type { enum ggml_backend_dev_type {
GGML_BACKEND_DEVICE_TYPE_CPU, GGML_BACKEND_DEVICE_TYPE_CPU,
GGML_BACKEND_DEVICE_TYPE_GPU, GGML_BACKEND_DEVICE_TYPE_GPU,
// devices with full capabilities (excludes backends such as BLAS) // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL, GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
GGML_BACKEND_DEVICE_TYPE_GPU_FULL GGML_BACKEND_DEVICE_TYPE_GPU_FULL
}; };
// functionality supported by the device
// one boolean flag per optional capability; a flag set to true means the device reports support for it
struct ggml_backend_dev_caps {
// asynchronous operations
// true if the device supports asynchronous operations
bool async;
// pinned host buffer
// true if the device can provide a pinned host buffer
bool host_buffer;
// event synchronization
// true if the device supports synchronization through events
bool events;
};
// all the device properties
// gathered in a single struct so that they can be queried with one call (see ggml_backend_dev_get_props)
struct ggml_backend_dev_props {
// device name: short identifier for this device, such as "CPU" or "CUDA0"
const char * name;
// device description: short informative description of the device, could be the model name
const char * description;
// free device memory in bytes
size_t memory_free;
// total device memory in bytes
size_t memory_total;
// device type
enum ggml_backend_dev_type type;
// functionality supported by the device
struct ggml_backend_dev_caps caps;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device); GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total); GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_device_type ggml_backend_dev_type(ggml_backend_dev_t device); GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
@ -130,8 +156,6 @@ extern "C" {
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API ggml_backend_event_t ggml_backend_dev_event_new(ggml_backend_dev_t device);
// //
// Backend (reg) // Backend (reg)
// //
@ -158,16 +182,16 @@ extern "C" {
GGML_API size_t ggml_backend_dev_count(void); GGML_API size_t ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index); GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_device_type type); GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
// Set the log callback for all registered backends // Set the log callback for all registered backends
GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data); GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data);
// Direct Backend (stream) initialization // Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_device_type type, const char * params); GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL) // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void); GGML_API ggml_backend_t ggml_backend_init_best(void);
@ -276,7 +300,6 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor); GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
// //
// CPU backend // CPU backend
// //

View File

@ -9,10 +9,9 @@ extern "C" {
#endif #endif
// //
// Backend buffer // Backend buffer type
// //
// buffer type
struct ggml_backend_buffer_type_i { struct ggml_backend_buffer_type_i {
const char * (*get_name) (ggml_backend_buffer_type_t buft); const char * (*get_name) (ggml_backend_buffer_type_t buft);
// allocate a buffer of this type // allocate a buffer of this type
@ -33,7 +32,10 @@ extern "C" {
void * context; void * context;
}; };
// buffer //
// Backend buffer
//
struct ggml_backend_buffer_i { struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer); const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer // (optional) free the buffer
@ -143,15 +145,26 @@ extern "C" {
}; };
// //
// Backend registry v2 // Backend device
// //
// Note: if additional properties are needed, we should add a struct with all of them
// the current functions to obtain the properties can remain, since they are more convenient for often used properties
struct ggml_backend_device_i { struct ggml_backend_device_i {
// device properties // device name: short identifier for this device, such as "CPU" or "CUDA0"
const char * (*get_name)(ggml_backend_dev_t dev); const char * (*get_name)(ggml_backend_dev_t dev);
// device description: short informative description of the device, could be the model name
const char * (*get_description)(ggml_backend_dev_t dev); const char * (*get_description)(ggml_backend_dev_t dev);
// device memory in bytes
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total); void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
enum ggml_backend_device_type (*get_type)(ggml_backend_dev_t dev);
// device type
enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
// device properties
void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
// get the backend (reg) associated with this device // get the backend (reg) associated with this device
ggml_backend_reg_t (*get_backend_reg)(ggml_backend_dev_t dev); ggml_backend_reg_t (*get_backend_reg)(ggml_backend_dev_t dev);
@ -190,6 +203,10 @@ extern "C" {
void * context; void * context;
}; };
//
// Backend (reg)
//
struct ggml_backend_reg_i { struct ggml_backend_reg_i {
const char * (*get_name)(ggml_backend_reg_t reg); const char * (*get_name)(ggml_backend_reg_t reg);
@ -212,7 +229,7 @@ extern "C" {
}; };
// Internal API // Internal backend registry API
void ggml_backend_register(ggml_backend_reg_t reg); void ggml_backend_register(ggml_backend_reg_t reg);
void ggml_backend_device_register(ggml_backend_dev_t device); void ggml_backend_device_register(ggml_backend_dev_t device);
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function

View File

@ -331,6 +331,10 @@ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor *
return false; return false;
} }
// Returns the device handle stored in the given backend (stream).
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
    ggml_backend_dev_t dev = backend->device;
    return dev;
}
// backend copy // backend copy
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@ -440,10 +444,14 @@ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t *
device->iface.get_memory(device, free, total); device->iface.get_memory(device, free, total);
} }
enum ggml_backend_device_type ggml_backend_dev_type(ggml_backend_dev_t device) { enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
return device->iface.get_type(device); return device->iface.get_type(device);
} }
// Queries the properties of a device.
// The request is forwarded to the device's get_props implementation, which writes its results through `props`.
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
    (*device->iface.get_props)(device, props);
}
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) { ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
return device->iface.get_backend_reg(device); return device->iface.get_backend_reg(device);
} }
@ -476,13 +484,6 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
return device->iface.offload_op(device, op); return device->iface.offload_op(device, op);
} }
ggml_backend_event_t ggml_backend_dev_event_new(ggml_backend_dev_t device) {
if (!device->iface.event_new) {
return NULL;
}
return device->iface.event_new(device);
}
// Backend (reg) // Backend (reg)
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) { const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
@ -603,7 +604,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
return NULL; return NULL;
} }
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_device_type type) { ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i); ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == type) { if (ggml_backend_dev_type(dev) == type) {
@ -629,7 +630,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
return ggml_backend_dev_init(dev, params); return ggml_backend_dev_init(dev, params);
} }
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_device_type type, const char * params) { ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) { if (!dev) {
return NULL; return NULL;
@ -1028,60 +1029,72 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
//////////////////////// ////////////////////////
static const char * ggml_backend_cpu_device_name(ggml_backend_dev_t device) { static const char * ggml_backend_cpu_device_name(ggml_backend_dev_t dev) {
return "CPU"; return "CPU";
GGML_UNUSED(device); GGML_UNUSED(dev);
} }
static const char * ggml_backend_cpu_device_description(ggml_backend_dev_t device) { static const char * ggml_backend_cpu_device_description(ggml_backend_dev_t dev) {
// TODO // TODO
return "CPU"; return "CPU";
GGML_UNUSED(device); GGML_UNUSED(dev);
} }
static void ggml_backend_cpu_device_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { static void ggml_backend_cpu_device_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO // TODO
*free = 0; *free = 0;
*total = 0; *total = 0;
GGML_UNUSED(device); GGML_UNUSED(dev);
} }
static enum ggml_backend_device_type ggml_backend_cpu_device_type(ggml_backend_dev_t device) { static enum ggml_backend_dev_type ggml_backend_cpu_device_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU_FULL; return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
GGML_UNUSED(device); GGML_UNUSED(dev);
} }
static ggml_backend_reg_t ggml_backend_cpu_device_reg(ggml_backend_dev_t device) { static void ggml_backend_cpu_device_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_cpu_device_name(dev);
props->description = ggml_backend_cpu_device_description(dev);
props->type = ggml_backend_cpu_device_type(dev);
ggml_backend_cpu_device_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* async */ false,
/* host_buffer */ false,
/* events */ false,
};
}
static ggml_backend_reg_t ggml_backend_cpu_device_reg(ggml_backend_dev_t dev) {
return ggml_backend_cpu_reg(); return ggml_backend_cpu_reg();
GGML_UNUSED(device); GGML_UNUSED(dev);
} }
static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t device, const char * params) { static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_cpu_init(); return ggml_backend_cpu_init();
GGML_UNUSED(device); GGML_UNUSED(dev);
GGML_UNUSED(params); GGML_UNUSED(params);
} }
static ggml_backend_buffer_type_t ggml_backend_cpu_device_buffer_type(ggml_backend_dev_t device) { static ggml_backend_buffer_type_t ggml_backend_cpu_device_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cpu_buffer_type(); return ggml_backend_cpu_buffer_type();
GGML_UNUSED(device); GGML_UNUSED(dev);
} }
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) { static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size); return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(device); GGML_UNUSED(dev);
GGML_UNUSED(max_tensor_size); GGML_UNUSED(max_tensor_size);
} }
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
switch (op->op) { switch (op->op) {
case GGML_OP_CPY: case GGML_OP_CPY:
return return
@ -1101,13 +1114,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t device, const
return true; return true;
} }
GGML_UNUSED(device); GGML_UNUSED(dev);
} }
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) { static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft); return ggml_backend_buft_is_host(buft);
GGML_UNUSED(device); GGML_UNUSED(dev);
} }
struct ggml_backend_device_i ggml_backend_cpu_device_i = { struct ggml_backend_device_i ggml_backend_cpu_device_i = {
@ -1115,6 +1128,7 @@ struct ggml_backend_device_i ggml_backend_cpu_device_i = {
/* .get_description = */ ggml_backend_cpu_device_description, /* .get_description = */ ggml_backend_cpu_device_description,
/* .get_memory = */ ggml_backend_cpu_device_memory, /* .get_memory = */ ggml_backend_cpu_device_memory,
/* .get_type = */ ggml_backend_cpu_device_type, /* .get_type = */ ggml_backend_cpu_device_type,
/* .get_props = */ ggml_backend_cpu_device_props,
/* .get_backend_reg = */ ggml_backend_cpu_device_reg, /* .get_backend_reg = */ ggml_backend_cpu_device_reg,
/* .init_backend = */ ggml_backend_cpu_device_init, /* .init_backend = */ ggml_backend_cpu_device_init,
/* .buffer_type = */ ggml_backend_cpu_device_buffer_type, /* .buffer_type = */ ggml_backend_cpu_device_buffer_type,

View File

@ -2929,11 +2929,31 @@ static void ggml_backend_cuda_device_memory(ggml_backend_dev_t dev, size_t * fre
CUDA_CHECK(cudaMemGetInfo(free, total)); CUDA_CHECK(cudaMemGetInfo(free, total));
} }
static enum ggml_backend_device_type ggml_backend_cuda_device_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_cuda_device_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev); GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
} }
// Fills `props` with the properties and capabilities of a CUDA device.
// The basic fields reuse the individual device getters; the capability flags are set below.
static void ggml_backend_cuda_device_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cuda_device_name(dev);
    props->description = ggml_backend_cuda_device_description(dev);
    props->type        = ggml_backend_cuda_device_type(dev);
    ggml_backend_cuda_device_memory(dev, &props->memory_free, &props->memory_total);

    // pinned host buffers are reported unless GGML_CUDA_NO_PINNED is set in the environment
    const bool pinned_available = (getenv("GGML_CUDA_NO_PINNED") == nullptr);

    // events are reported as unsupported in builds that define GGML_CUDA_NO_PEER_COPY
#ifdef GGML_CUDA_NO_PEER_COPY
    const bool events_available = false;
#else
    const bool events_available = true;
#endif

    props->caps.async       = true;
    props->caps.host_buffer = pinned_available;
    props->caps.events      = events_available;
}
static ggml_backend_reg_t ggml_backend_cuda_device_reg(ggml_backend_dev_t dev) { static ggml_backend_reg_t ggml_backend_cuda_device_reg(ggml_backend_dev_t dev) {
GGML_UNUSED(dev); GGML_UNUSED(dev);
return ggml_backend_cuda_reg(); return ggml_backend_cuda_reg();
@ -3206,7 +3226,6 @@ static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const gg
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }
static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) { static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
#ifdef GGML_CUDA_NO_PEER_COPY #ifdef GGML_CUDA_NO_PEER_COPY
return nullptr; return nullptr;
@ -3242,6 +3261,7 @@ static ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_description = */ ggml_backend_cuda_device_description, /* .get_description = */ ggml_backend_cuda_device_description,
/* .get_memory = */ ggml_backend_cuda_device_memory, /* .get_memory = */ ggml_backend_cuda_device_memory,
/* .get_type = */ ggml_backend_cuda_device_type, /* .get_type = */ ggml_backend_cuda_device_type,
/* .get_props = */ ggml_backend_cuda_device_props,
/* .get_backend_reg = */ ggml_backend_cuda_device_reg, /* .get_backend_reg = */ ggml_backend_cuda_device_reg,
/* .init_backend = */ ggml_backend_cuda_device_init, /* .init_backend = */ ggml_backend_cuda_device_init,
/* .buffer_type = */ ggml_backend_cuda_device_buffer_type, /* .buffer_type = */ ggml_backend_cuda_device_buffer_type,

View File

@ -5035,7 +5035,7 @@ struct llama_model_loader {
// Returns false if cancelled by progress_callback // Returns false if cancelled by progress_callback
bool load_all_data( bool load_all_data(
struct ggml_context * ctx, struct ggml_context * ctx,
llama_buf_map & bufs_mmap, llama_buf_map & bufs,
llama_mlocks * lmlocks, llama_mlocks * lmlocks,
llama_progress_callback progress_callback, llama_progress_callback progress_callback,
void * progress_callback_user_data) { void * progress_callback_user_data) {
@ -5044,7 +5044,6 @@ struct llama_model_loader {
std::vector<no_init<uint8_t>> read_buf; std::vector<no_init<uint8_t>> read_buf;
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result; std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
// TODO: adapt to ggml-backend
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers. // NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4; constexpr size_t n_buffers = 4;
@ -5054,26 +5053,84 @@ struct llama_model_loader {
std::vector<ggml_backend_event_t> events; std::vector<ggml_backend_event_t> events;
std::vector<void *> host_ptrs; std::vector<void *> host_ptrs;
size_t buffer_idx = 0; // buffer to use for async loads size_t buffer_idx = 0; // buffer to use for async loads
ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t {
// TODO: only do this if the backend supports all the required features: async, events, pinned memory if (use_mmap || check_tensors) {
// it also must be avoided for split buffers and other buffers that require the entire tensor to be loaded at once return nullptr;
ggml_backend_t upload_backend = nullptr; }
if (!use_mmap && !check_tensors) {
// When not using mmaped io use async uploads from pinned memory to GPU memory. // When not using mmaped io use async uploads from pinned memory to GPU memory.
// First determine if the CUDA backend is active, and if so, determine the device ID. // First determine if the backend supports the necessary features for async uploads.
ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr; auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
ggml_backend_dev_t dev = buf ? ggml_backend_buft_get_device(ggml_backend_buffer_get_type(buf)) : nullptr; if (!buf) {
ggml_backend_buffer_type_t host_buft = dev ? ggml_backend_dev_host_buffer_type(dev) : nullptr; LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
upload_backend = host_buft ? ggml_backend_dev_init(dev, nullptr) : nullptr; return nullptr;
}
// If the cuda is active create pinned memory buffers and events for synchronisation. auto * buft = ggml_backend_buffer_get_type(buf);
if (upload_backend) { auto * dev = ggml_backend_buft_get_device(buft);
if (!dev) {
LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
ggml_backend_buft_name(buft));
return nullptr;
}
if (buft != ggml_backend_dev_buffer_type(dev)) {
LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
return nullptr;
}
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
}
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
if (!host_buft) {
LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
}
// If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) { for (size_t idx = 0; idx < n_buffers; ++idx) {
host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(host_buft, buffer_size)); auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); if (!buf) {
events.emplace_back(ggml_backend_dev_event_new(dev)); LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
} }
host_buffers.emplace_back(buf);
host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
auto * event = ggml_backend_event_new(dev);
if (!event) {
LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
} }
events.emplace_back(event);
}
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
if (!backend) {
LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
ggml_backend_dev_name(dev));
return nullptr;
}
return backend;
}(__func__);
if (upload_backend) {
LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
ggml_backend_name(upload_backend));
} }
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
@ -5094,8 +5151,8 @@ struct llama_model_loader {
if (use_mmap) { if (use_mmap) {
const auto & mapping = mappings.at(weight->idx); const auto & mapping = mappings.at(weight->idx);
ggml_backend_buffer_t buf_mmap = nullptr; ggml_backend_buffer_t buf_mmap = nullptr;
if (bufs_mmap.count(weight->idx)) { if (bufs.count(weight->idx)) {
buf_mmap = bufs_mmap.at(weight->idx); buf_mmap = bufs.at(weight->idx);
} }
uint8_t * data = (uint8_t *) mapping->addr + weight->offs; uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
@ -5131,7 +5188,7 @@ struct llama_model_loader {
})); }));
} }
} else { } else {
// If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) { if (upload_backend) {
file->seek(weight->offs, SEEK_SET); file->seek(weight->offs, SEEK_SET);

View File

@ -672,14 +672,11 @@ struct test_case {
} }
// run // run
ggml_backend_synchronize(backend);
int64_t total_time_us = 0; int64_t total_time_us = 0;
int total_runs = 0; int total_runs = 0;
do { do {
int64_t start_time = ggml_time_us(); int64_t start_time = ggml_time_us();
ggml_backend_graph_compute(backend, gf); ggml_backend_graph_compute(backend, gf);
ggml_backend_synchronize(backend);
int64_t end_time = ggml_time_us(); int64_t end_time = ggml_time_us();
total_time_us += end_time - start_time; total_time_us += end_time - start_time;