commit 6ff0e7a32e (parent 805fea97ac)

add device props/caps, fully support async upload for all compatible backends
--- ggml-backend.h ---

@@ -20,13 +20,13 @@ extern "C" {
     // Backend buffer type
     //

-    GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
-    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
-    GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
-    GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
-    GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
+    GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);

     //
     // Backend buffer

@@ -53,6 +53,9 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
     GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);

+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
     //
     // Backend (stream)
     //
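For orientation (illustrative only, not part of the diff): the buffer-type API above is what the loader further down uses to obtain a pinned host staging buffer. A minimal sketch, assuming `dev` is a valid ggml_backend_dev_t obtained from the device API introduced below:

    ggml_backend_buffer_type_t host_buft = ggml_backend_dev_host_buffer_type(dev);
    if (host_buft != NULL) {
        // allocate a 1 MiB staging buffer of this buffer type
        ggml_backend_buffer_t staging = ggml_backend_buft_alloc_buffer(host_buft, 1024 * 1024);
        if (staging != NULL) {
            void * base = ggml_backend_buffer_get_base(staging); // host-writable pointer
            // ... fill `base`, then hand it to an asynchronous upload ...
        }
    }

The staging buffer would be released with ggml_backend_buffer_free() once the upload that reads from it has completed.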
@@ -88,49 +91,70 @@ extern "C" {
     GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
     // asynchronous copy
     // the copy is performed after all the currently queued operations in backend_src
     // backend_dst will wait for the copy to complete before performing other operations
     // automatic fallback to sync copy if async is not supported
     GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

-    // events
-    GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_dev_t device);
-    GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_record (ggml_backend_event_t event, ggml_backend_t backend);
+    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
+
+    //
+    // Events
+    //
+
+    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+    GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
+    GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
     GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
+    GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);

     //
     // Backend device
     //

-    enum ggml_backend_device_type {
+    enum ggml_backend_dev_type {
         GGML_BACKEND_DEVICE_TYPE_CPU,
         GGML_BACKEND_DEVICE_TYPE_GPU,
-        // devices with full capabilities (excludes backends such as BLAS)
+        // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
         GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
         GGML_BACKEND_DEVICE_TYPE_GPU_FULL
     };

+    // functionality supported by the device
+    struct ggml_backend_dev_caps {
+        // asynchronous operations
+        bool async;
+        // pinned host buffer
+        bool host_buffer;
+        // event synchronization
+        bool events;
+    };
+
+    // all the device properties
+    struct ggml_backend_dev_props {
+        const char * name;
+        const char * description;
+        size_t memory_free;
+        size_t memory_total;
+        enum ggml_backend_dev_type type;
+        struct ggml_backend_dev_caps caps;
+    };
+
     GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
     GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
     GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
-    GGML_API enum ggml_backend_device_type ggml_backend_dev_type(ggml_backend_dev_t device);
+    GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
+    GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
     GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
     GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
     GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
     GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
     GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);

-    GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
-    GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
-    GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
-
-    GGML_API ggml_backend_event_t ggml_backend_dev_event_new(ggml_backend_dev_t device);
+    GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+    GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);

     //
     // Backend (reg)
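A hedged usage sketch of the new properties API (illustrative, not from the commit; assumes ggml-backend.h and stdio.h are included): query a device's props once and gate the async-upload path on the reported caps, mirroring the check the llama.cpp loader performs further down.

    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
    if (dev != NULL) {
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        printf("%s (%s): %zu / %zu bytes free\n",
               props.name, props.description, props.memory_free, props.memory_total);

        // async uploads need all three capabilities
        bool can_async_upload = props.caps.async && props.caps.host_buffer && props.caps.events;
        (void) can_async_upload;
    }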
@@ -158,16 +182,16 @@ extern "C" {
     GGML_API size_t ggml_backend_dev_count(void);
     GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
     GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
-    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_device_type type);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);

     // Set the log callback for all registered backends
     GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data);

-    // Direct Backend (stream) initialization
+    // Direct backend (stream) initialization
     // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
     GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
-    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_device_type type, const char * params);
+    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
     GGML_API ggml_backend_t ggml_backend_init_best(void);
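Illustrative only: how the registry helpers above compose. The device name "CUDA0" is just an example; the available names depend on the build.

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s - %s\n", i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }

    ggml_backend_t backend = ggml_backend_init_by_name("CUDA0", /* params */ NULL);
    if (backend == NULL) {
        backend = ggml_backend_init_best(); // GPU_FULL if available, otherwise CPU_FULL
    }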
@@ -276,7 +300,6 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

     //
     // CPU backend
     //
--- ggml-backend-impl.h ---

@@ -9,10 +9,9 @@ extern "C" {
 #endif

     //
-    // Backend buffer
+    // Backend buffer type
     //

-    // buffer type
     struct ggml_backend_buffer_type_i {
         const char * (*get_name) (ggml_backend_buffer_type_t buft);
         // allocate a buffer of this type

@@ -33,7 +32,10 @@ extern "C" {
         void * context;
     };

-    // buffer
+    //
+    // Backend buffer
+    //
+
     struct ggml_backend_buffer_i {
         const char * (*get_name) (ggml_backend_buffer_t buffer);
         // (optional) free the buffer

@@ -143,15 +145,26 @@ extern "C" {
     };

     //
-    // Backend registry v2
+    // Backend device
     //

+    // Note: if additional properties are needed, we should add a struct with all of them
+    // the current functions to obtain the properties can remain, since they are more convenient for often used properties
     struct ggml_backend_device_i {
-        // device properties
+        // device name: short identifier for this device, such as "CPU" or "CUDA0"
         const char * (*get_name)(ggml_backend_dev_t dev);

+        // device description: short informative description of the device, could be the model name
         const char * (*get_description)(ggml_backend_dev_t dev);

+        // device memory in bytes
         void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
-        enum ggml_backend_device_type (*get_type)(ggml_backend_dev_t dev);
+
+        // device type
+        enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
+
+        // device properties
+        void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);

         // get the backend (reg) associated with this device
         ggml_backend_reg_t (*get_backend_reg)(ggml_backend_dev_t dev);

@@ -190,6 +203,10 @@ extern "C" {
         void * context;
     };

+    //
+    // Backend (reg)
+    //
+
     struct ggml_backend_reg_i {
         const char * (*get_name)(ggml_backend_reg_t reg);

@@ -212,7 +229,7 @@ extern "C" {
     };

-    // Internal API
+    // Internal backend registry API
     void ggml_backend_register(ggml_backend_reg_t reg);
     void ggml_backend_device_register(ggml_backend_dev_t device);
     // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
--- ggml-backend (implementation) ---

@@ -331,6 +331,10 @@ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor *
     return false;
 }

+ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
+    return backend->device;
+}
+
 // backend copy

 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
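A small illustrative sketch (not in the commit) of what the new accessor enables: given an initialized backend, walk back to its device and registry, e.g. for logging. Assumes `backend` is a valid ggml_backend_t.

    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    printf("backend %s runs on device %s (registered by %s)\n",
           ggml_backend_name(backend), ggml_backend_dev_name(dev), ggml_backend_reg_name(reg));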
@@ -440,10 +444,14 @@ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t *
     device->iface.get_memory(device, free, total);
 }

-enum ggml_backend_device_type ggml_backend_dev_type(ggml_backend_dev_t device) {
+enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
     return device->iface.get_type(device);
 }

+void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+    device->iface.get_props(device, props);
+}
+
 ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
     return device->iface.get_backend_reg(device);
 }

@@ -476,13 +484,6 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
     return device->iface.offload_op(device, op);
 }

-ggml_backend_event_t ggml_backend_dev_event_new(ggml_backend_dev_t device) {
-    if (!device->iface.event_new) {
-        return NULL;
-    }
-    return device->iface.event_new(device);
-}
-
 // Backend (reg)

 const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
@@ -603,7 +604,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
     return NULL;
 }

-ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_device_type type) {
+ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
         if (ggml_backend_dev_type(dev) == type) {

@@ -629,7 +630,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
     return ggml_backend_dev_init(dev, params);
 }

-ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_device_type type, const char * params) {
+ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
     ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
     if (!dev) {
         return NULL;
@@ -1028,60 +1029,72 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user

 ////////////////////////

-static const char * ggml_backend_cpu_device_name(ggml_backend_dev_t device) {
+static const char * ggml_backend_cpu_device_name(ggml_backend_dev_t dev) {
     return "CPU";

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
 }

-static const char * ggml_backend_cpu_device_description(ggml_backend_dev_t device) {
+static const char * ggml_backend_cpu_device_description(ggml_backend_dev_t dev) {
     // TODO
     return "CPU";

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
 }

-static void ggml_backend_cpu_device_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+static void ggml_backend_cpu_device_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     // TODO
     *free = 0;
     *total = 0;

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
 }

-static enum ggml_backend_device_type ggml_backend_cpu_device_type(ggml_backend_dev_t device) {
+static enum ggml_backend_dev_type ggml_backend_cpu_device_type(ggml_backend_dev_t dev) {
     return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
 }

-static ggml_backend_reg_t ggml_backend_cpu_device_reg(ggml_backend_dev_t device) {
+static void ggml_backend_cpu_device_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name = ggml_backend_cpu_device_name(dev);
+    props->description = ggml_backend_cpu_device_description(dev);
+    props->type = ggml_backend_cpu_device_type(dev);
+    ggml_backend_cpu_device_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* async */ false,
+        /* host_buffer */ false,
+        /* events */ false,
+    };
+}
+
+static ggml_backend_reg_t ggml_backend_cpu_device_reg(ggml_backend_dev_t dev) {
     return ggml_backend_cpu_reg();

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
 }

-static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t device, const char * params) {
+static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
     return ggml_backend_cpu_init();

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
     GGML_UNUSED(params);
 }

-static ggml_backend_buffer_type_t ggml_backend_cpu_device_buffer_type(ggml_backend_dev_t device) {
+static ggml_backend_buffer_type_t ggml_backend_cpu_device_buffer_type(ggml_backend_dev_t dev) {
     return ggml_backend_cpu_buffer_type();

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
 }

-static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     return ggml_backend_cpu_buffer_from_ptr(ptr, size);

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
     GGML_UNUSED(max_tensor_size);
 }

-static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
             return

@@ -1101,13 +1114,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t device, const
             return true;
     }

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
 }

-static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
     return ggml_backend_buft_is_host(buft);

-    GGML_UNUSED(device);
+    GGML_UNUSED(dev);
 }

 struct ggml_backend_device_i ggml_backend_cpu_device_i = {

@@ -1115,6 +1128,7 @@ struct ggml_backend_device_i ggml_backend_cpu_device_i = {
     /* .get_description = */ ggml_backend_cpu_device_description,
     /* .get_memory = */ ggml_backend_cpu_device_memory,
     /* .get_type = */ ggml_backend_cpu_device_type,
+    /* .get_props = */ ggml_backend_cpu_device_props,
     /* .get_backend_reg = */ ggml_backend_cpu_device_reg,
     /* .init_backend = */ ggml_backend_cpu_device_init,
     /* .buffer_type = */ ggml_backend_cpu_device_buffer_type,
--- ggml-cuda.cu ---

@@ -2929,11 +2929,31 @@ static void ggml_backend_cuda_device_memory(ggml_backend_dev_t dev, size_t * fre
     CUDA_CHECK(cudaMemGetInfo(free, total));
 }

-static enum ggml_backend_device_type ggml_backend_cuda_device_type(ggml_backend_dev_t dev) {
+static enum ggml_backend_dev_type ggml_backend_cuda_device_type(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
     return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
 }

+static void ggml_backend_cuda_device_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name = ggml_backend_cuda_device_name(dev);
+    props->description = ggml_backend_cuda_device_description(dev);
+    props->type = ggml_backend_cuda_device_type(dev);
+    ggml_backend_cuda_device_memory(dev, &props->memory_free, &props->memory_total);
+
+    bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
+#ifdef GGML_CUDA_NO_PEER_COPY
+    bool events = false;
+#else
+    bool events = true;
+#endif
+
+    props->caps = {
+        /* async */ true,
+        /* host_buffer */ host_buffer,
+        /* events */ events,
+    };
+}
+
 static ggml_backend_reg_t ggml_backend_cuda_device_reg(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
     return ggml_backend_cuda_reg();

@@ -3206,7 +3226,6 @@ static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const gg
     GGML_UNUSED(dev);
 }

 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
 #ifdef GGML_CUDA_NO_PEER_COPY
     return nullptr;

@@ -3242,6 +3261,7 @@ static ggml_backend_device_i ggml_backend_cuda_device_interface = {
     /* .get_description = */ ggml_backend_cuda_device_description,
     /* .get_memory = */ ggml_backend_cuda_device_memory,
     /* .get_type = */ ggml_backend_cuda_device_type,
+    /* .get_props = */ ggml_backend_cuda_device_props,
     /* .get_backend_reg = */ ggml_backend_cuda_device_reg,
     /* .init_backend = */ ggml_backend_cuda_device_init,
     /* .buffer_type = */ ggml_backend_cuda_device_buffer_type,
--- src/llama.cpp ---
@@ -5035,7 +5035,7 @@ struct llama_model_loader {
     // Returns false if cancelled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
-            llama_buf_map & bufs_mmap,
+            llama_buf_map & bufs,
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
             void * progress_callback_user_data) {

@@ -5044,7 +5044,6 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

-        // TODO: adapt to ggml-backend
         // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
         // NVMe raid configurations might require more / larger buffers.
         constexpr size_t n_buffers = 4;
@@ -5054,26 +5053,84 @@ struct llama_model_loader {
         std::vector<ggml_backend_event_t> events;
         std::vector<void *> host_ptrs;
         size_t buffer_idx = 0; // buffer to use for async loads

-        // TODO: only do this if the backend supports all the required features: async, events, pinned memory
-        // it also must be avoided for split buffers and other buffers that require the entire tensor to be loaded at once
-        ggml_backend_t upload_backend = nullptr;
-        if (!use_mmap && !check_tensors) {
-            // When not using mmaped io use async uploads from pinned memory to GPU memory.
-            // First determine if the CUDA backend is active, and if so, determine the device ID.
-            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
-            ggml_backend_dev_t dev = buf ? ggml_backend_buft_get_device(ggml_backend_buffer_get_type(buf)) : nullptr;
-            ggml_backend_buffer_type_t host_buft = dev ? ggml_backend_dev_host_buffer_type(dev) : nullptr;
-            upload_backend = host_buft ? ggml_backend_dev_init(dev, nullptr) : nullptr;
-
-            // If the cuda is active create pinned memory buffers and events for synchronisation.
-            if (upload_backend) {
-                for (size_t idx = 0; idx < n_buffers; ++idx) {
-                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(host_buft, buffer_size));
-                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
-                    events.emplace_back(ggml_backend_dev_event_new(dev));
-                }
+        ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t {
+            if (use_mmap || check_tensors) {
+                return nullptr;
+            }
+            // When not using mmaped io use async uploads from pinned memory to GPU memory.
+            // First determine if the backend supports the necessary features for async uploads.
+            auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
+            if (!buf) {
+                LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
+                return nullptr;
+            }
+
+            auto * buft = ggml_backend_buffer_get_type(buf);
+            auto * dev = ggml_backend_buft_get_device(buft);
+            if (!dev) {
+                LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
+                    ggml_backend_buft_name(buft));
+                return nullptr;
+            }
+
+            if (buft != ggml_backend_dev_buffer_type(dev)) {
+                LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
+                    ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            ggml_backend_dev_props props;
+            ggml_backend_dev_get_props(dev, &props);
+            if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
+                LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+            if (!host_buft) {
+                LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            // If the backend is supported, create pinned memory buffers and events for synchronisation.
+            for (size_t idx = 0; idx < n_buffers; ++idx) {
+                auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+                if (!buf) {
+                    LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
+                        ggml_backend_dev_name(dev));
+                    return nullptr;
+                }
+
+                host_buffers.emplace_back(buf);
+                host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+
+                auto * event = ggml_backend_event_new(dev);
+                if (!event) {
+                    LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
+                        ggml_backend_dev_name(dev));
+                    return nullptr;
+                }
+
+                events.emplace_back(event);
+            }
+
+            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+            if (!backend) {
+                LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            return backend;
+        }(__func__);
+
+        if (upload_backend) {
+            LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
+                ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
+                ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
+                ggml_backend_name(upload_backend));
+        }

         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
@@ -5094,8 +5151,8 @@ struct llama_model_loader {
             if (use_mmap) {
                 const auto & mapping = mappings.at(weight->idx);
                 ggml_backend_buffer_t buf_mmap = nullptr;
-                if (bufs_mmap.count(weight->idx)) {
-                    buf_mmap = bufs_mmap.at(weight->idx);
+                if (bufs.count(weight->idx)) {
+                    buf_mmap = bufs.at(weight->idx);
                 }
                 uint8_t * data = (uint8_t *) mapping->addr + weight->offs;

@@ -5131,7 +5188,7 @@ struct llama_model_loader {
                     }));
                 }
             } else {
-                // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                 if (upload_backend) {
                     file->seek(weight->offs, SEEK_SET);
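The upload path referenced by the comment above is roughly the following (a sketch, not the verbatim diff; `cur`, `n_size`, `file`, `host_ptrs`, `events`, `buffer_idx` and `n_buffers` are the loader-local names from the surrounding code, and ggml_backend_tensor_set_async is the existing asynchronous tensor-set API). Each chunk is read into a pinned staging buffer, pushed to the device asynchronously, and fenced with an event so the staging buffer is only reused once its previous upload has finished.

    size_t bytes_read = 0;
    while (bytes_read < n_size) {
        const size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);

        // wait until this staging buffer's previous upload has completed
        ggml_backend_event_synchronize(events[buffer_idx]);
        file->read_raw(host_ptrs[buffer_idx], read_iteration);

        // asynchronous copy from pinned host memory into the target tensor
        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
        ggml_backend_event_record(events[buffer_idx], upload_backend);

        bytes_read += read_iteration;
        buffer_idx = (buffer_idx + 1) % n_buffers;
    }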
--- tests/test-backend-ops.cpp ---

@@ -672,14 +672,11 @@ struct test_case {
         }

         // run
         ggml_backend_synchronize(backend);

         int64_t total_time_us = 0;
         int total_runs = 0;
         do {
             int64_t start_time = ggml_time_us();
             ggml_backend_graph_compute(backend, gf);
             ggml_backend_synchronize(backend);
             int64_t end_time = ggml_time_us();

             total_time_us += end_time - start_time;