llama : refactor model loader with backend registry (#10026)

This commit is contained in:
Diego Devesa 2024-10-30 02:01:23 +01:00 committed by GitHub
parent 8f275a7c45
commit c5b0f4b5d9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 1903 additions and 2019 deletions

View File

@ -21,12 +21,6 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include "ggml-cuda.h"
#include "ggml-sycl.h"
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef _WIN32 #ifdef _WIN32
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
@ -82,95 +76,27 @@ static T stdev(const std::vector<T> & v) {
} }
static std::string get_cpu_info() { static std::string get_cpu_info() {
std::string id; std::vector<std::string> cpu_list;
#ifdef __linux__ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
FILE * f = fopen("/proc/cpuinfo", "r"); auto * dev = ggml_backend_dev_get(i);
if (f) { auto dev_type = ggml_backend_dev_type(dev);
char buf[1024]; if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
while (fgets(buf, sizeof(buf), f)) { cpu_list.push_back(ggml_backend_dev_description(dev));
if (strncmp(buf, "model name", 10) == 0) {
char * p = strchr(buf, ':');
if (p) {
p++;
while (std::isspace(*p)) {
p++;
}
while (std::isspace(p[strlen(p) - 1])) {
p[strlen(p) - 1] = '\0';
}
id = p;
break;
}
}
}
fclose(f);
}
#elif defined(_WIN32)
HKEY hKey;
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
0,
KEY_READ,
&hKey) != ERROR_SUCCESS) {
// fail to open registry key
return "";
}
char cpu_brand[256];
DWORD cpu_brand_size = sizeof(cpu_brand);
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
(LPBYTE)cpu_brand,
&cpu_brand_size) == ERROR_SUCCESS) {
id.assign(cpu_brand, cpu_brand_size);
if (id.find('\0') != std::string::npos) {
id.resize(id.find('\0'));
} }
} }
RegCloseKey(hKey); return join(cpu_list, ", ");
#endif
// TODO: other platforms
return id;
} }
static std::string get_gpu_info() { static std::string get_gpu_info() {
std::string id; std::vector<std::string> gpu_list;
#ifdef GGML_USE_CUDA for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
int count = ggml_backend_cuda_get_device_count(); auto * dev = ggml_backend_dev_get(i);
for (int i = 0; i < count; i++) { auto dev_type = ggml_backend_dev_type(dev);
char buf[128]; if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); gpu_list.push_back(ggml_backend_dev_description(dev));
id += buf;
if (i < count - 1) {
id += "/";
} }
} }
#endif return join(gpu_list, ", ");
#ifdef GGML_USE_SYCL
int count = ggml_backend_sycl_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
#ifdef GGML_USE_CANN
uint32_t count = ggml_backend_cann_get_device_count();
for (uint32_t i = 0; i < count; i++) {
char buf[128];
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
// TODO: other backends
return id;
} }
// command line params // command line params
@ -938,29 +864,15 @@ struct test {
} }
static std::string get_backend() { static std::string get_backend() {
if (cuda) { std::vector<std::string> backends;
return GGML_CUDA_NAME; for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
auto * reg = ggml_backend_reg_get(i);
std::string name = ggml_backend_reg_name(reg);
if (name != "CPU") {
backends.push_back(ggml_backend_reg_name(reg));
}
} }
if (vulkan) { return backends.empty() ? "CPU" : join(backends, ",");
return "Vulkan";
}
if (kompute) {
return "Kompute";
}
if (metal) {
return "Metal";
}
if (sycl) {
return GGML_SYCL_NAME;
}
if (gpu_blas) {
return "GPU BLAS";
}
if (blas) {
return "BLAS";
}
return "CPU";
} }
static const std::vector<std::string> & get_fields() { static const std::vector<std::string> & get_fields() {

View File

@ -114,11 +114,12 @@ extern "C" {
// //
enum ggml_backend_dev_type { enum ggml_backend_dev_type {
// CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU, GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU, GGML_BACKEND_DEVICE_TYPE_GPU,
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication) // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL, GGML_BACKEND_DEVICE_TYPE_ACCEL
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
}; };
// functionality supported by the device // functionality supported by the device
@ -167,10 +168,14 @@ extern "C" {
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// Functions that may be obtained using ggml_backend_reg_get_proc_address // Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *); typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int); // Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// //
// Backend registry // Backend registry
@ -192,7 +197,7 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params); GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL) // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void); GGML_API ggml_backend_t ggml_backend_init_best(void);
// //

View File

@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices // split tensor buffer that splits matrices by rows across multiple devices
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

View File

@ -16,12 +16,6 @@
#if defined(__AMX_INT8__) #if defined(__AMX_INT8__)
// AMX buffer interface // AMX buffer interface
static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
return "AMX";
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context); free(buffer->context);
} }
@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
} }
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .get_name = */ ggml_backend_amx_buffer_get_name,
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base, /* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ { /* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name, /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host, /* .is_host = */ ggml_backend_amx_buffer_type_is_host,
}, },
/* .device = */ NULL, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_amx_buffer_type();
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
static struct ggml_backend_i ggml_backend_amx_i = { static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name, /* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free, /* .free = */ ggml_backend_amx_free,
/* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
/* .set_tensor_async = */ NULL, /* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL, /* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL,
@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute, /* .graph_compute = */ ggml_backend_amx_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
}; };
@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t *
} }
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU; return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }

View File

@ -22,7 +22,7 @@ extern "C" {
size_t (*get_max_size) (ggml_backend_buffer_type_t buft); size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes) // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory (defaults to false) // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft); bool (*is_host) (ggml_backend_buffer_type_t buft);
}; };
@ -37,7 +37,6 @@ extern "C" {
// //
struct ggml_backend_buffer_i { struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer // (optional) free the buffer
void (*free_buffer) (ggml_backend_buffer_t buffer); void (*free_buffer) (ggml_backend_buffer_t buffer);
// base address of the buffer // base address of the buffer
@ -88,19 +87,16 @@ extern "C" {
void (*free)(ggml_backend_t backend); void (*free)(ggml_backend_t backend);
// Will be moved to the device interface
// buffer allocation
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access // (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations // (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend); void (*synchronize)(ggml_backend_t backend);
// (optional) compute graph with a plan (not used currently) // (optional) graph plans (not used currently)
// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@ -111,13 +107,6 @@ extern "C" {
// compute graph (always async if supported by the backend) // compute graph (always async if supported by the backend)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
// new backends should implement the device interface instead
// These functions are being moved to the device interface
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization // (optional) event synchronization
// record an event on this stream // record an event on this stream
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event); void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);

View File

@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
} }
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
if (size == 0) {
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0);
}
return buft->iface.alloc_buffer(buft, size); return buft->iface.alloc_buffer(buft, size);
} }
@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
} }
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) { const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name(buffer); return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
} }
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
} }
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
// get_base is optional if the buffer is zero-sized
if (buffer->size == 0) {
return NULL;
}
void * base = buffer->iface.get_base(buffer); void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@ -122,6 +132,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
} }
} }
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
// clear is optional if the buffer is zero-sized
if (buffer->size == 0) {
return;
}
buffer->iface.clear(buffer, value);
}
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
} }
@ -134,10 +153,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
} }
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
buffer->iface.clear(buffer, value);
}
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
} }
@ -198,7 +213,7 @@ void ggml_backend_free(ggml_backend_t backend) {
} }
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
return backend->iface.get_default_buffer_type(backend); return ggml_backend_dev_buffer_type(backend->device);
} }
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
@ -238,43 +253,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (size == 0) {
return;
}
GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
if (!size) {
return;
}
buf->iface.set_tensor(buf, tensor, data, offset, size); buf->iface.set_tensor(buf, tensor, data, offset, size);
} }
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (size == 0) {
return;
}
GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
if (!size) {
return;
}
buf->iface.get_tensor(buf, tensor, data, offset, size); buf->iface.get_tensor(buf, tensor, data, offset, size);
} }
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set"); if (size == 0) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
if (!size) {
return; return;
} }
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer"); GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
buf->iface.memset_tensor(buf, tensor, value, offset, size); buf->iface.memset_tensor(buf, tensor, value, offset, size);
} }
@ -316,32 +330,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
} }
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
// helper to ease transition to device interface return ggml_backend_dev_supports_op(backend->device, op);
if (backend->device) {
return ggml_backend_dev_supports_op(backend->device, op);
}
return backend->iface.supports_op(backend, op);
} }
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
// helper to ease transition to device interface return ggml_backend_dev_supports_buft(backend->device, buft);
if (backend->device) {
return ggml_backend_dev_supports_buft(backend->device, buft);
}
return backend->iface.supports_buft(backend, buft);
} }
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
// helper to ease transition to device interface return ggml_backend_dev_offload_op(backend->device, op);
if (backend->device) {
return ggml_backend_dev_offload_op(backend->device, op);
}
if (backend->iface.offload_op != NULL) {
return backend->iface.offload_op(backend, op);
}
return false;
} }
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
@ -582,6 +579,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg()); register_backend(ggml_backend_vk_reg());
#endif #endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
#ifdef GGML_USE_BLAS #ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg()); register_backend(ggml_backend_blas_reg());
#endif #endif
@ -591,9 +591,6 @@ struct ggml_backend_registry {
#ifdef GGML_USE_AMX #ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg()); register_backend(ggml_backend_amx_reg());
#endif #endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
// TODO: kompute // TODO: kompute
@ -701,9 +698,9 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
} }
ggml_backend_t ggml_backend_init_best(void) { ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL); ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) { if (!dev) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL); dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
} }
if (!dev) { if (!dev) {
return NULL; return NULL;
@ -711,13 +708,7 @@ ggml_backend_t ggml_backend_init_best(void) {
return ggml_backend_dev_init(dev, NULL); return ggml_backend_dev_init(dev, NULL);
} }
// backend CPU // CPU backend - buffer
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
return "CPU";
GGML_UNUSED(buffer);
}
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
uintptr_t data = (uintptr_t)buffer->context; uintptr_t data = (uintptr_t)buffer->context;
@ -767,7 +758,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
} }
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
@ -780,7 +770,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
}; };
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
/* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
@ -792,6 +781,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
/* .reset = */ NULL, /* .reset = */ NULL,
}; };
// CPU backend - buffer type
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU"; return "CPU";
@ -799,19 +790,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
} }
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
auto alloc_size = size; void * data = ggml_aligned_malloc(size);
if (alloc_size == 0) {
alloc_size = 1;
}
void * data = ggml_aligned_malloc(alloc_size);
if (data == NULL) { if (data == NULL) {
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size); GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL; return NULL;
} }
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size); return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
} }
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@ -843,6 +829,29 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
return &ggml_backend_cpu_buffer_type; return &ggml_backend_cpu_buffer_type;
} }
static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_Mapped";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
};
return &ggml_backend_cpu_buffer_type;
}
#ifdef GGML_USE_CPU_HBM #ifdef GGML_USE_CPU_HBM
// buffer type HBM // buffer type HBM
@ -855,18 +864,11 @@ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffe
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
return "CPU_HBM";
GGML_UNUSED(buf);
}
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context); hbw_free(buffer->context);
} }
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
//void * ptr = hbw_malloc(size);
void * ptr; void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) { if (result != 0) {
@ -876,7 +878,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft; buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer; return buffer;
@ -899,6 +900,21 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
} }
#endif #endif
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
static ggml_backend_buffer_type_t bufts[] = {
#ifdef GGML_USE_CPU_HBM
ggml_backend_cpu_hbm_buffer_type(),
#endif
NULL
};
return bufts;
GGML_UNUSED(device);
}
// CPU backend - backend (stream)
struct ggml_backend_cpu_context { struct ggml_backend_cpu_context {
int n_threads; int n_threads;
ggml_threadpool_t threadpool; ggml_threadpool_t threadpool;
@ -923,12 +939,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(backend);
}
struct ggml_backend_plan_cpu { struct ggml_backend_plan_cpu {
struct ggml_cplan cplan; struct ggml_cplan cplan;
struct ggml_cgraph cgraph; struct ggml_cgraph cgraph;
@ -998,7 +1008,6 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
static const struct ggml_backend_i ggml_backend_cpu_i = { static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .get_name = */ ggml_backend_cpu_get_name, /* .get_name = */ ggml_backend_cpu_get_name,
/* .free = */ ggml_backend_cpu_free, /* .free = */ ggml_backend_cpu_free,
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
/* .set_tensor_async = */ NULL, /* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL, /* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL,
@ -1008,9 +1017,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
}; };
@ -1081,10 +1087,10 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
} }
//////////////////////// // CPU backend - device
struct ggml_backend_cpu_device_context { struct ggml_backend_cpu_device_context {
std::string description = "CPU"; std::string description = "CPU";
@ -1171,7 +1177,7 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
} }
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU_FULL; return GGML_BACKEND_DEVICE_TYPE_CPU;
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }
@ -1189,7 +1195,7 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
}; };
} }
static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) { static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_cpu_init(); return ggml_backend_cpu_init();
GGML_UNUSED(dev); GGML_UNUSED(dev);
@ -1202,7 +1208,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_b
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size); return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(dev); GGML_UNUSED(dev);
@ -1244,10 +1250,10 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
/* .get_memory = */ ggml_backend_cpu_device_get_memory, /* .get_memory = */ ggml_backend_cpu_device_get_memory,
/* .get_type = */ ggml_backend_cpu_device_get_type, /* .get_type = */ ggml_backend_cpu_device_get_type,
/* .get_props = */ ggml_backend_cpu_device_get_props, /* .get_props = */ ggml_backend_cpu_device_get_props,
/* .init_backend = */ ggml_backend_cpu_device_init, /* .init_backend = */ ggml_backend_cpu_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type, /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL, /* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr, /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_cpu_device_supports_op, /* .supports_op = */ ggml_backend_cpu_device_supports_op,
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft, /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
/* .offload_op = */ NULL, /* .offload_op = */ NULL,
@ -1256,7 +1262,7 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
/* .event_synchronize = */ NULL, /* .event_synchronize = */ NULL,
}; };
//////////////////////// // CPU backend - backend (reg)
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
return "CPU"; return "CPU";
@ -1287,6 +1293,10 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
if (strcmp(name, "ggml_backend_set_n_threads") == 0) { if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_cpu_set_n_threads; return (void *)ggml_backend_cpu_set_n_threads;
} }
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts;
}
return NULL; return NULL;
GGML_UNUSED(reg); GGML_UNUSED(reg);
@ -1315,12 +1325,6 @@ struct ggml_backend_multi_buffer_context {
size_t n_buffers; size_t n_buffers;
}; };
static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
}
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) { for (size_t i = 0; i < ctx->n_buffers; i++) {
@ -1339,7 +1343,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
} }
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = { static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
/* .get_name = */ ggml_backend_multi_buffer_get_name,
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
/* .get_base = */ NULL, /* .get_base = */ NULL,
/* .init_tensor = */ NULL, /* .init_tensor = */ NULL,
@ -1368,7 +1371,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
} }
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name; return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
} }
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
@ -1460,7 +1463,7 @@ struct ggml_backend_sched {
char * context_buffer; char * context_buffer;
size_t context_buffer_size; size_t context_buffer_size;
bool debug; int debug;
}; };
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@ -1500,7 +1503,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
return -1; return -1;
} }
#if 0 #if 1
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
@ -1548,7 +1551,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (src == NULL) { if (src == NULL) {
continue; continue;
} }
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
// not an ideal solution
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op // check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1) { if (src_backend_id == sched->n_backends - 1) {
@ -1595,19 +1600,21 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
if (ggml_is_view_op(node->op)) { if (ggml_is_view_op(node->op)) {
continue; continue;
} }
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); if (sched->debug > 1) {
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
for (int j = 0; j < GGML_MAX_SRC; j++) { fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
struct ggml_tensor * src = node->src[j]; for (int j = 0; j < GGML_MAX_SRC; j++) {
if (src == NULL) { struct ggml_tensor * src = node->src[j];
continue; if (src == NULL) {
continue;
}
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
} }
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); GGML_LOG_DEBUG("\n");
GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
} }
GGML_LOG_DEBUG("\n");
} }
} }
@ -1899,11 +1906,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (src == NULL) { if (src == NULL) {
continue; continue;
} }
// check if a weight is on a different backend // check if a weight is on a different and incompatible backend
// by starting a new split, the memory of the previously offloaded weights can be reused // by starting a new split, the memory of the previously offloaded weights can be reused
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = tensor_backend_id(src); int src_backend_id = tensor_backend_id(src);
if (src_backend_id != cur_backend_id) { if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
need_new_split = true; need_new_split = true;
break; break;
} }
@ -1915,7 +1922,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
int src_backend_id = sched->hv_tensor_backend_ids[id]; int src_backend_id = sched->hv_tensor_backend_ids[id];
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) { if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true; need_new_split = true;
break; break;
} }
@ -2240,7 +2246,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
sched->n_backends = n_backends; sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

View File

@ -224,12 +224,6 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
@ -265,7 +259,6 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
static struct ggml_backend_i blas_backend_i = { static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name, /* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free, /* .free = */ ggml_backend_blas_free,
/* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
/* .set_tensor_async = */ NULL, /* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL, /* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL,
@ -275,9 +268,6 @@ static struct ggml_backend_i blas_backend_i = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_blas_graph_compute, /* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
}; };
@ -356,7 +346,7 @@ static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t *
} }
static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU; return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }
@ -374,7 +364,7 @@ static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct gg
}; };
} }
static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) { static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_blas_init(); return ggml_backend_blas_init();
GGML_UNUSED(dev); GGML_UNUSED(dev);
@ -387,7 +377,7 @@ static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }
static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size); return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(dev); GGML_UNUSED(dev);
@ -456,10 +446,10 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
/* .get_memory = */ ggml_backend_blas_device_get_memory, /* .get_memory = */ ggml_backend_blas_device_get_memory,
/* .get_type = */ ggml_backend_blas_device_get_type, /* .get_type = */ ggml_backend_blas_device_get_type,
/* .get_props = */ ggml_backend_blas_device_get_props, /* .get_props = */ ggml_backend_blas_device_get_props,
/* .init_backend = */ ggml_backend_blas_device_init, /* .init_backend = */ ggml_backend_blas_device_init_backend,
/* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type, /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL, /* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr, /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_blas_device_supports_op, /* .supports_op = */ ggml_backend_blas_device_supports_op,
/* .supports_buft = */ ggml_backend_blas_device_supports_buft, /* .supports_buft = */ ggml_backend_blas_device_supports_buft,
/* .offload_op = */ NULL, /* .offload_op = */ NULL,

View File

@ -489,23 +489,6 @@ struct ggml_backend_cann_buffer_context {
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); } ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
}; };
/**
* @brief Retrieve the name associated with a CANN buffer.
*
* This function returns the name of a CANN buffer, which is stored in the
* context of the buffer.
*
* @param buffer The CANN buffer whose name is to be retrieved.
* @return A pointer to a C-string containing the name of the buffer.
*/
static const char* ggml_backend_cann_buffer_get_name(
ggml_backend_buffer_t buffer) {
return "CANN";
GGML_UNUSED(buffer);
}
/** /**
* @brief Check if a buffer is a CANN buffer. * @brief Check if a buffer is a CANN buffer.
* *
@ -515,9 +498,10 @@ static const char* ggml_backend_cann_buffer_get_name(
* @param buffer The buffer to check. * @param buffer The buffer to check.
* @return true if the buffer is a CANN buffer, false otherwise. * @return true if the buffer is a CANN buffer, false otherwise.
*/ */
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
static bool ggml_backend_buffer_is_cann( static bool ggml_backend_buffer_is_cann(
ggml_backend_buffer_t buffer) { ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_cann_buffer_get_name; return ggml_backend_buft_is_cann(buffer->buft);
} }
/** /**
@ -965,7 +949,6 @@ static void ggml_backend_cann_buffer_clear(
* on a CANN buffer within the backend. * on a CANN buffer within the backend.
*/ */
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .get_name = */ ggml_backend_cann_buffer_get_name,
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
/* .get_base = */ ggml_backend_cann_buffer_get_base, /* .get_base = */ ggml_backend_cann_buffer_get_base,
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor, /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
@ -999,9 +982,10 @@ struct ggml_backend_cann_buffer_type_context {
*/ */
static const char* ggml_backend_cann_buffer_type_name( static const char* ggml_backend_cann_buffer_type_name(
ggml_backend_buffer_type_t buft) { ggml_backend_buffer_type_t buft) {
return "CANN"; ggml_backend_cann_buffer_type_context* buft_ctx =
(ggml_backend_cann_buffer_type_context*)buft->context;
GGML_UNUSED(buft); return buft_ctx->name.c_str();
} }
/** /**
@ -1465,24 +1449,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
/**
* @brief Retrieves the default buffer type associated with the CANN backend.
*
* This function returns the buffer type specific to the device associated
* with the CANN backend. It is used to allocate buffers for computations
* performed by the backend.
*
* @param backend Pointer to the CANN backend structure.
* @return Pointer to the buffer type structure for the CANN backend.
*/
static ggml_backend_buffer_type_t
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
return ggml_backend_cann_buffer_type(cann_ctx->device);
}
/** /**
* @brief Sets tensor data asynchronously in the CANN backend. * @brief Sets tensor data asynchronously in the CANN backend.
* *
@ -1863,7 +1829,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
static const ggml_backend_i ggml_backend_cann_interface = { static const ggml_backend_i ggml_backend_cann_interface = {
/* .get_name = */ ggml_backend_cann_name, /* .get_name = */ ggml_backend_cann_name,
/* .free = */ ggml_backend_cann_free, /* .free = */ ggml_backend_cann_free,
/* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async, /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async, /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async, /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
@ -1873,9 +1838,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cann_graph_compute, /* .graph_compute = */ ggml_backend_cann_graph_compute,
/* .supports_op = */ NULL, // moved to device
/* .supports_buft = */ NULL, // moved to device
/* .offload_op = */ NULL, // moved to device
/* .event_record = */ ggml_backend_cann_event_record, /* .event_record = */ ggml_backend_cann_event_record,
/* .event_wait = */ ggml_backend_cann_event_wait, /* .event_wait = */ ggml_backend_cann_event_wait,
}; };
@ -1918,7 +1880,7 @@ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t *
static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev); GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; return GGML_BACKEND_DEVICE_TYPE_GPU;
} }
static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {

View File

@ -421,20 +421,15 @@ struct ggml_backend_cuda_buffer_context {
} }
}; };
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
return ctx->name.c_str();
}
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
}
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx; delete ctx;
} }
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
}
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
return ctx->dev_ptr; return ctx->dev_ptr;
@ -515,7 +510,6 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
} }
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .get_base = */ ggml_backend_cuda_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
@ -548,8 +542,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
ggml_cuda_set_device(buft_ctx->device); ggml_cuda_set_device(buft_ctx->device);
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
void * dev_ptr; void * dev_ptr;
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device); cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
if (err != cudaSuccess) { if (err != cudaSuccess) {
@ -657,7 +649,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
} }
struct ggml_backend_cuda_split_buffer_type_context { struct ggml_backend_cuda_split_buffer_type_context {
int main_device;
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split; std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
std::string name;
}; };
struct ggml_backend_cuda_split_buffer_context { struct ggml_backend_cuda_split_buffer_context {
@ -680,16 +674,6 @@ struct ggml_backend_cuda_split_buffer_context {
std::vector<ggml_tensor_extra_gpu *> tensor_extras; std::vector<ggml_tensor_extra_gpu *> tensor_extras;
}; };
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
return GGML_CUDA_NAME "_Split";
GGML_UNUSED(buffer);
}
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
}
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@ -833,7 +817,6 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
} }
static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base, /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
@ -848,9 +831,9 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
// cuda split buffer type // cuda split buffer type
static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return GGML_CUDA_NAME "_Split"; ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
GGML_UNUSED(buft); return ctx->name.c_str();
} }
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
@ -915,11 +898,11 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
}; };
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
static std::mutex mutex; static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map; static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {}; std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
@ -937,18 +920,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
} }
} }
auto it = buft_map.find(tensor_split_arr); auto it = buft_map.find({main_device, tensor_split_arr});
if (it != buft_map.end()) { if (it != buft_map.end()) {
return &it->second; return &it->second;
} }
auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
main_device,
tensor_split_arr,
GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
};
struct ggml_backend_buffer_type buft { struct ggml_backend_buffer_type buft {
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface, /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0), /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, /* .context = */ ctx,
}; };
auto result = buft_map.emplace(tensor_split_arr, buft); auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
return &result.first->second; return &result.first->second;
} }
@ -960,12 +948,6 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
return GGML_CUDA_NAME "_Host";
GGML_UNUSED(buffer);
}
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context)); CUDA_CHECK(cudaFreeHost(buffer->context));
} }
@ -998,7 +980,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft; buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
return buffer; return buffer;
@ -1400,7 +1381,7 @@ static void ggml_cuda_op_mul_mat(
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne02 > 1));
GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12)); GGML_ASSERT(!(split && ne02 < ne12));
@ -1890,7 +1871,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
} }
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type) bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
@ -2017,7 +1998,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers"); GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
@ -2150,7 +2131,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat? // why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) { if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
} }
@ -2371,12 +2352,6 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
}
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@ -2582,7 +2557,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
continue; continue;
} }
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
@ -2669,7 +2644,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) { if (node->src[j] != nullptr) {
assert(node->src[j]->buffer); assert(node->src[j]->buffer);
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
} }
} }
#endif #endif
@ -2762,7 +2738,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) { if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__); GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
#endif #endif
// The pre-existing graph exec cannot be updated due to violated constraints // The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate // so instead clear error and re-instantiate
@ -2811,7 +2787,6 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
static const ggml_backend_i ggml_backend_cuda_interface = { static const ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_get_name, /* .get_name = */ ggml_backend_cuda_get_name,
/* .free = */ ggml_backend_cuda_free, /* .free = */ ggml_backend_cuda_free,
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
@ -2821,9 +2796,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ NULL, // moved to device
/* .supports_buft = */ NULL, // moved to device
/* .offload_op = */ NULL, // moved to device
/* .event_record = */ ggml_backend_cuda_event_record, /* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait, /* .event_wait = */ ggml_backend_cuda_event_wait,
}; };
@ -2913,7 +2885,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev); GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; return GGML_BACKEND_DEVICE_TYPE_GPU;
} }
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@ -2937,7 +2909,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
}; };
} }
static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) { static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params); GGML_UNUSED(params);
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ggml_backend_cuda_init(ctx->device); return ggml_backend_cuda_init(ctx->device);
@ -2953,18 +2925,29 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
return ggml_backend_cuda_host_buffer_type(); return ggml_backend_cuda_host_buffer_type();
} }
static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
GGML_UNUSED(dev);
GGML_UNUSED(ptr);
GGML_UNUSED(size);
GGML_UNUSED(max_tensor_size);
return nullptr;
}
// TODO: move these functions here // TODO: move these functions here
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
// split buffers can only be used with GGML_OP_MUL_MAT
if (op->op != GGML_OP_MUL_MAT) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
return false;
}
}
}
// check if all the sources are allocated on this device
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
if (buft_ctx->device != dev_ctx->device) {
return false;
}
}
}
switch (op->op) { switch (op->op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) { switch (ggml_get_unary_op(op)) {
@ -3190,24 +3173,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
} }
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cuda_split(buft)) { return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
return true; }
}
if (ggml_backend_buft_is_cuda(buft)) { static int64_t get_op_batch_size(const ggml_tensor * op) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context; switch (op->op) {
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; case GGML_OP_GET_ROWS:
return buft_ctx->device == dev_ctx->device; return 0;
case GGML_OP_MUL_MAT:
return op->ne[1];
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ROPE:
return op->ne[2];
default:
return ggml_nrows(op);
} }
return false;
} }
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
const int min_batch_size = 32; const int min_batch_size = 32;
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || return get_op_batch_size(op) >= min_batch_size;
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }
@ -3248,10 +3234,10 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_memory = */ ggml_backend_cuda_device_get_memory, /* .get_memory = */ ggml_backend_cuda_device_get_memory,
/* .get_type = */ ggml_backend_cuda_device_get_type, /* .get_type = */ ggml_backend_cuda_device_get_type,
/* .get_props = */ ggml_backend_cuda_device_get_props, /* .get_props = */ ggml_backend_cuda_device_get_props,
/* .init_backend = */ ggml_backend_cuda_device_init, /* .init_backend = */ ggml_backend_cuda_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type, /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type, /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
/* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr, /* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_cuda_device_supports_op, /* .supports_op = */ ggml_backend_cuda_device_supports_op,
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft, /* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
/* .offload_op = */ ggml_backend_cuda_device_offload_op, /* .offload_op = */ ggml_backend_cuda_device_offload_op,

View File

@ -1820,11 +1820,6 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
} }
} }
static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
return ctx->name.c_str();
}
static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
auto * memory = (ggml_vk_memory *)buffer->context; auto * memory = (ggml_vk_memory *)buffer->context;
if (ggml_vk_has_device()) { if (ggml_vk_has_device()) {
@ -1868,7 +1863,6 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint
} }
static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
/* .get_name = */ ggml_backend_kompute_buffer_get_name,
/* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer, /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
/* .get_base = */ ggml_backend_kompute_buffer_get_base, /* .get_base = */ ggml_backend_kompute_buffer_get_base,
/* .init_tensor = */ NULL, /* .init_tensor = */ NULL,
@ -1953,11 +1947,6 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
return ggml_backend_kompute_buffer_type(ctx->device);
}
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
auto * ctx = static_cast<ggml_kompute_context *>(backend->context); auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
ggml_vk_graph_compute(ctx, cgraph); ggml_vk_graph_compute(ctx, cgraph);
@ -1977,7 +1966,6 @@ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_back
static struct ggml_backend_i kompute_backend_i = { static struct ggml_backend_i kompute_backend_i = {
/* .get_name = */ ggml_backend_kompute_name, /* .get_name = */ ggml_backend_kompute_name,
/* .free = */ ggml_backend_kompute_free, /* .free = */ ggml_backend_kompute_free,
/* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
/* .set_tensor_async = */ NULL, /* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL, /* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL,
@ -1987,9 +1975,6 @@ static struct ggml_backend_i kompute_backend_i = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .graph_compute = */ ggml_backend_kompute_graph_compute,
/* .supports_op = */ ggml_backend_kompute_supports_op,
/* .supports_buft = */ ggml_backend_kompute_supports_buft,
/* .offload_op = */ NULL,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
}; };

View File

@ -3247,12 +3247,6 @@ static enum ggml_status ggml_metal_graph_compute(
// backend interface // backend interface
static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
return "Metal";
UNUSED(buffer);
}
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
@ -3307,7 +3301,6 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
} }
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
/* .get_name = */ ggml_backend_metal_buffer_get_name,
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_get_base, /* .get_base = */ ggml_backend_metal_buffer_get_base,
/* .init_tensor = */ NULL, /* .init_tensor = */ NULL,
@ -3432,6 +3425,29 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
return &ggml_backend_buffer_type_metal; return &ggml_backend_buffer_type_metal;
} }
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
return "Metal_Mapped";
UNUSED(buft);
}
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
},
/* .device = */ &g_ggml_backend_metal_device,
/* .context = */ NULL,
};
return &ggml_backend_buffer_from_ptr_type_metal;
}
// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr // TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
@ -3508,7 +3524,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
} }
} }
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
} }
// backend // backend
@ -3529,12 +3545,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
free(backend); free(backend);
} }
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_metal_buffer_type();
UNUSED(backend);
}
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return ggml_metal_graph_compute(backend, cgraph); return ggml_metal_graph_compute(backend, cgraph);
} }
@ -3601,7 +3611,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
static struct ggml_backend_i ggml_backend_metal_i = { static struct ggml_backend_i ggml_backend_metal_i = {
/* .get_name = */ ggml_backend_metal_name, /* .get_name = */ ggml_backend_metal_name,
/* .free = */ ggml_backend_metal_free, /* .free = */ ggml_backend_metal_free,
/* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
/* .set_tensor_async = */ NULL, /* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL, /* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL,
@ -3611,9 +3620,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute, /* .graph_compute = */ ggml_backend_metal_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
}; };
@ -3708,7 +3714,7 @@ static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t
} }
static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; return GGML_BACKEND_DEVICE_TYPE_GPU;
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }

View File

@ -178,7 +178,6 @@ struct ggml_backend_rpc_buffer_context {
std::shared_ptr<socket_t> sock; std::shared_ptr<socket_t> sock;
std::unordered_map<ggml_backend_buffer_t, void *> base_cache; std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
uint64_t remote_ptr; uint64_t remote_ptr;
std::string name;
}; };
// RPC helper functions // RPC helper functions
@ -409,11 +408,6 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
return sock; return sock;
} }
static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
return ctx->name.c_str();
}
static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
rpc_msg_free_buffer_req request = {ctx->remote_ptr}; rpc_msg_free_buffer_req request = {ctx->remote_ptr};
@ -524,7 +518,6 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
} }
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = { static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
/* .get_name = */ ggml_backend_rpc_buffer_get_name,
/* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer, /* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer,
/* .get_base = */ ggml_backend_rpc_buffer_get_base, /* .get_base = */ ggml_backend_rpc_buffer_get_base,
/* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor, /* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor,
@ -551,7 +544,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
if (response.remote_ptr != 0) { if (response.remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface, ggml_backend_rpc_buffer_interface,
new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"}, new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
response.remote_size); response.remote_size);
return buffer; return buffer;
} else { } else {
@ -609,11 +602,6 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
}
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) { static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
UNUSED(backend); UNUSED(backend);
// this is no-op because we don't have any async operations // this is no-op because we don't have any async operations
@ -670,7 +658,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
static ggml_backend_i ggml_backend_rpc_interface = { static ggml_backend_i ggml_backend_rpc_interface = {
/* .get_name = */ ggml_backend_rpc_name, /* .get_name = */ ggml_backend_rpc_name,
/* .free = */ ggml_backend_rpc_free, /* .free = */ ggml_backend_rpc_free,
/* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type,
/* .set_tensor_async = */ NULL, /* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL, /* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL,
@ -680,9 +667,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .graph_compute = */ ggml_backend_rpc_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
}; };
@ -1278,7 +1262,7 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
// TODO: obtain value from the server // TODO: obtain value from the server
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; return GGML_BACKEND_DEVICE_TYPE_GPU;
UNUSED(dev); UNUSED(dev);
} }

View File

@ -249,13 +249,10 @@ struct ggml_backend_sycl_buffer_context {
} }
}; };
static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) { static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft);
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
return ctx->name.c_str();
}
static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) { static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name; return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name;
} }
static void static void
@ -440,7 +437,6 @@ catch (sycl::exception const &exc) {
} }
static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
/* .get_name = */ ggml_backend_sycl_buffer_get_name,
/* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer, /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
/* .get_base = */ ggml_backend_sycl_buffer_get_base, /* .get_base = */ ggml_backend_sycl_buffer_get_base,
/* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor, /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor,
@ -698,16 +694,6 @@ struct ggml_backend_sycl_split_buffer_context {
std::vector<queue_ptr> streams; std::vector<queue_ptr> streams;
}; };
static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
return GGML_SYCL_NAME "_Split";
GGML_UNUSED(buffer);
}
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
}
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx; delete ctx;
@ -915,7 +901,6 @@ static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, u
} }
static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
/* .get_name = */ ggml_backend_sycl_split_buffer_get_name,
/* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer, /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer,
/* .get_base = */ ggml_backend_sycl_split_buffer_get_base, /* .get_base = */ ggml_backend_sycl_split_buffer_get_base,
/* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor, /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor,
@ -935,6 +920,10 @@ static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_bu
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
}
static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
// instead, we allocate them for each tensor separately in init_tensor // instead, we allocate them for each tensor separately in init_tensor
@ -1040,12 +1029,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
return GGML_SYCL_NAME "_Host";
GGML_UNUSED(buffer);
}
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context); ggml_sycl_host_free(buffer->context);
} }
@ -1061,7 +1044,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
// FIXME: this is a hack to avoid having to implement a new buffer type // FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft; buffer->buft = buft;
buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer; buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
return buffer; return buffer;
@ -4889,12 +4871,6 @@ static void ggml_backend_sycl_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
}
static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
ggml_tensor *tensor, ggml_tensor *tensor,
const void *data, size_t offset, const void *data, size_t offset,
@ -5031,7 +5007,6 @@ static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_ev
static ggml_backend_i ggml_backend_sycl_interface = { static ggml_backend_i ggml_backend_sycl_interface = {
/* .get_name = */ ggml_backend_sycl_get_name, /* .get_name = */ ggml_backend_sycl_get_name,
/* .free = */ ggml_backend_sycl_free, /* .free = */ ggml_backend_sycl_free,
/* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async, /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async, /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async, /* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
@ -5043,9 +5018,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .graph_compute = */ ggml_backend_sycl_graph_compute,
/* .supports_op = */ NULL, // moved to device
/* .supports_buft = */ NULL, // moved to device
/* .offload_op = */ NULL, // moved to device
/* .event_record = */ ggml_backend_sycl_event_record, /* .event_record = */ ggml_backend_sycl_event_record,
/* .event_wait = */ ggml_backend_sycl_event_wait, /* .event_wait = */ ggml_backend_sycl_event_wait,
}; };
@ -5092,7 +5064,7 @@ static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t *
static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev); GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; return GGML_BACKEND_DEVICE_TYPE_GPU;
} }
static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@ -5388,12 +5360,14 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
return ctx->devices[index]; return ctx->devices[index];
} }
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
{
GGML_UNUSED(reg); GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
return (void *)ggml_backend_sycl_split_buffer_type; // TODO: update to the current function signature
} //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
// return (void *)ggml_backend_sycl_split_buffer_type;
//}
// SYCL doesn't support registering host memory, left here for reference // SYCL doesn't support registering host memory, left here for reference
// "ggml_backend_register_host_buffer" // "ggml_backend_register_host_buffer"
// "ggml_backend_unregister_host_buffer" // "ggml_backend_unregister_host_buffer"

View File

@ -6247,13 +6247,8 @@ static void ggml_vk_get_device_description(int device, char * description, size_
// device backend // device backend
static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
return ctx->name.c_str();
}
static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_vk_buffer_get_name; return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
} }
static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@ -6317,7 +6312,6 @@ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t v
} }
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
/* .get_name = */ ggml_backend_vk_buffer_get_name,
/* .free_buffer = */ ggml_backend_vk_buffer_free_buffer, /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
/* .get_base = */ ggml_backend_vk_buffer_get_base, /* .get_base = */ ggml_backend_vk_buffer_get_base,
/* .init_tensor = */ ggml_backend_vk_buffer_init_tensor, /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
@ -6413,7 +6407,6 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft; buffer->buft = buft;
buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
return buffer; return buffer;
@ -6646,7 +6639,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
static ggml_backend_i ggml_backend_vk_interface = { static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name, /* .get_name = */ ggml_backend_vk_name,
/* .free = */ ggml_backend_vk_free, /* .free = */ ggml_backend_vk_free,
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async, /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async, /* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async, /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
@ -6656,9 +6648,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute, /* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
}; };
@ -6717,7 +6706,7 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
////////////////////////// //////////////////////////
struct ggml_backend_vk_device_context { struct ggml_backend_vk_device_context {
int device; size_t device;
std::string name; std::string name;
std::string description; std::string description;
}; };
@ -6749,7 +6738,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
UNUSED(dev); UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; return GGML_BACKEND_DEVICE_TYPE_GPU;
} }
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
@ -6758,9 +6747,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->type = ggml_backend_vk_device_get_type(dev); props->type = ggml_backend_vk_device_get_type(dev);
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = { props->caps = {
/* async */ false, /* .async = */ false,
/* host_buffer */ true, /* .host_buffer = */ true,
/* events */ false, /* .buffer_from_host_ptr = */ false,
/* .events = */ false,
}; };
} }
@ -6949,7 +6939,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
static std::mutex mutex; static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
if (!initialized) { if (!initialized) {
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) { for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
char desc[256]; char desc[256];
ggml_backend_vk_get_device_description(i, desc, sizeof(desc)); ggml_backend_vk_get_device_description(i, desc, sizeof(desc));

View File

@ -4028,7 +4028,9 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false); #ifndef NDEBUG
GGML_ABORT("not enough space in the context's memory pool");
#endif
return NULL; return NULL;
} }

View File

@ -205,7 +205,7 @@ extern "C" {
enum llama_split_mode { enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
}; };
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979) // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@ -274,10 +274,7 @@ extern "C" {
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// main_gpu interpretation depends on split_mode: // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_MODE_LAYER: ignored
int32_t main_gpu; int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()

View File

@ -20,7 +20,7 @@ logger = logging.getLogger("compare-llama-bench")
# Properties by which to differentiate results per commit: # Properties by which to differentiate results per commit:
KEY_PROPERTIES = [ KEY_PROPERTIES = [
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads", "blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
"type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen" "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
] ]

File diff suppressed because it is too large Load Diff