mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-14 06:49:54 +00:00
llama : refactor model loader with backend registry (#10026)
This commit is contained in:
parent
8f275a7c45
commit
c5b0f4b5d9
@ -21,12 +21,6 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml-sycl.h"
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
@ -82,95 +76,27 @@ static T stdev(const std::vector<T> & v) {
|
||||
}
|
||||
|
||||
static std::string get_cpu_info() {
|
||||
std::string id;
|
||||
#ifdef __linux__
|
||||
FILE * f = fopen("/proc/cpuinfo", "r");
|
||||
if (f) {
|
||||
char buf[1024];
|
||||
while (fgets(buf, sizeof(buf), f)) {
|
||||
if (strncmp(buf, "model name", 10) == 0) {
|
||||
char * p = strchr(buf, ':');
|
||||
if (p) {
|
||||
p++;
|
||||
while (std::isspace(*p)) {
|
||||
p++;
|
||||
}
|
||||
while (std::isspace(p[strlen(p) - 1])) {
|
||||
p[strlen(p) - 1] = '\0';
|
||||
}
|
||||
id = p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
HKEY hKey;
|
||||
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
|
||||
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
|
||||
0,
|
||||
KEY_READ,
|
||||
&hKey) != ERROR_SUCCESS) {
|
||||
// fail to open registry key
|
||||
return "";
|
||||
}
|
||||
char cpu_brand[256];
|
||||
DWORD cpu_brand_size = sizeof(cpu_brand);
|
||||
if (RegQueryValueExA(hKey,
|
||||
TEXT("ProcessorNameString"),
|
||||
NULL,
|
||||
NULL,
|
||||
(LPBYTE)cpu_brand,
|
||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||
id.assign(cpu_brand, cpu_brand_size);
|
||||
if (id.find('\0') != std::string::npos) {
|
||||
id.resize(id.find('\0'));
|
||||
std::vector<std::string> cpu_list;
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
auto dev_type = ggml_backend_dev_type(dev);
|
||||
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
||||
cpu_list.push_back(ggml_backend_dev_description(dev));
|
||||
}
|
||||
}
|
||||
RegCloseKey(hKey);
|
||||
#endif
|
||||
// TODO: other platforms
|
||||
return id;
|
||||
return join(cpu_list, ", ");
|
||||
}
|
||||
|
||||
static std::string get_gpu_info() {
|
||||
std::string id;
|
||||
#ifdef GGML_USE_CUDA
|
||||
int count = ggml_backend_cuda_get_device_count();
|
||||
for (int i = 0; i < count; i++) {
|
||||
char buf[128];
|
||||
ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
|
||||
id += buf;
|
||||
if (i < count - 1) {
|
||||
id += "/";
|
||||
std::vector<std::string> gpu_list;
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
auto dev_type = ggml_backend_dev_type(dev);
|
||||
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
||||
gpu_list.push_back(ggml_backend_dev_description(dev));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GGML_USE_SYCL
|
||||
int count = ggml_backend_sycl_get_device_count();
|
||||
for (int i = 0; i < count; i++) {
|
||||
char buf[128];
|
||||
ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
|
||||
id += buf;
|
||||
if (i < count - 1) {
|
||||
id += "/";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GGML_USE_CANN
|
||||
uint32_t count = ggml_backend_cann_get_device_count();
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
char buf[128];
|
||||
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
|
||||
id += buf;
|
||||
if (i < count - 1) {
|
||||
id += "/";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// TODO: other backends
|
||||
return id;
|
||||
return join(gpu_list, ", ");
|
||||
}
|
||||
|
||||
// command line params
|
||||
@ -938,29 +864,15 @@ struct test {
|
||||
}
|
||||
|
||||
static std::string get_backend() {
|
||||
if (cuda) {
|
||||
return GGML_CUDA_NAME;
|
||||
std::vector<std::string> backends;
|
||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
auto * reg = ggml_backend_reg_get(i);
|
||||
std::string name = ggml_backend_reg_name(reg);
|
||||
if (name != "CPU") {
|
||||
backends.push_back(ggml_backend_reg_name(reg));
|
||||
}
|
||||
}
|
||||
if (vulkan) {
|
||||
return "Vulkan";
|
||||
}
|
||||
if (kompute) {
|
||||
return "Kompute";
|
||||
}
|
||||
if (metal) {
|
||||
return "Metal";
|
||||
}
|
||||
if (sycl) {
|
||||
return GGML_SYCL_NAME;
|
||||
}
|
||||
if (gpu_blas) {
|
||||
return "GPU BLAS";
|
||||
}
|
||||
if (blas) {
|
||||
return "BLAS";
|
||||
}
|
||||
|
||||
return "CPU";
|
||||
return backends.empty() ? "CPU" : join(backends, ",");
|
||||
}
|
||||
|
||||
static const std::vector<std::string> & get_fields() {
|
||||
|
@ -114,11 +114,12 @@ extern "C" {
|
||||
//
|
||||
|
||||
enum ggml_backend_dev_type {
|
||||
// CPU device using system memory
|
||||
GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||
// GPU device using dedicated memory
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
|
||||
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
|
||||
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
||||
GGML_BACKEND_DEVICE_TYPE_ACCEL
|
||||
};
|
||||
|
||||
// functionality supported by the device
|
||||
@ -167,10 +168,14 @@ extern "C" {
|
||||
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
|
||||
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
|
||||
|
||||
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
||||
|
||||
// Functions that may be obtained using ggml_backend_reg_get_proc_address
|
||||
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
|
||||
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
|
||||
// Split buffer type for tensor parallelism
|
||||
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
||||
// Set the number of threads for the backend
|
||||
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
||||
// Get additional buffer types provided by the device (returns a NULL-terminated array)
|
||||
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
|
||||
|
||||
//
|
||||
// Backend registry
|
||||
@ -192,7 +197,7 @@ extern "C" {
|
||||
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
|
||||
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
|
||||
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
|
||||
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
|
||||
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
|
||||
GGML_API ggml_backend_t ggml_backend_init_best(void);
|
||||
|
||||
//
|
||||
|
@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
|
@ -16,12 +16,6 @@
|
||||
#if defined(__AMX_INT8__)
|
||||
|
||||
// AMX buffer interface
|
||||
static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return "AMX";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
}
|
||||
@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_amx_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_amx_buffer_get_base,
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft
|
||||
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
|
||||
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
|
||||
},
|
||||
/* .device = */ NULL,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return ggml_backend_amx_buffer_type();
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
|
||||
|
||||
@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
|
||||
static struct ggml_backend_i ggml_backend_amx_i = {
|
||||
/* .get_name = */ ggml_backend_amx_name,
|
||||
/* .free = */ ggml_backend_amx_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_amx_graph_compute,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
@ -22,7 +22,7 @@ extern "C" {
|
||||
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
|
||||
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
|
||||
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
// (optional) check if tensor data is in host memory (defaults to false)
|
||||
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
|
||||
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
||||
};
|
||||
|
||||
@ -37,7 +37,6 @@ extern "C" {
|
||||
//
|
||||
|
||||
struct ggml_backend_buffer_i {
|
||||
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
||||
// (optional) free the buffer
|
||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||
// base address of the buffer
|
||||
@ -88,19 +87,16 @@ extern "C" {
|
||||
|
||||
void (*free)(ggml_backend_t backend);
|
||||
|
||||
// Will be moved to the device interface
|
||||
// buffer allocation
|
||||
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
||||
|
||||
// (optional) asynchronous tensor data access
|
||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// (optional) complete all pending operations
|
||||
// (optional) complete all pending operations (required if the backend supports async operations)
|
||||
void (*synchronize)(ggml_backend_t backend);
|
||||
|
||||
// (optional) compute graph with a plan (not used currently)
|
||||
// (optional) graph plans (not used currently)
|
||||
// compute graph with a plan
|
||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
||||
@ -111,13 +107,6 @@ extern "C" {
|
||||
// compute graph (always async if supported by the backend)
|
||||
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
|
||||
// new backends should implement the device interface instead
|
||||
// These functions are being moved to the device interface
|
||||
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
||||
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// (optional) event synchronization
|
||||
// record an event on this stream
|
||||
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
||||
|
@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
if (size == 0) {
|
||||
// return a dummy buffer for zero-sized allocations
|
||||
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
||||
}
|
||||
|
||||
return buft->iface.alloc_buffer(buft, size);
|
||||
}
|
||||
|
||||
@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
}
|
||||
|
||||
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name(buffer);
|
||||
return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
||||
}
|
||||
|
||||
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
// get_base is optional if the buffer is zero-sized
|
||||
if (buffer->size == 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void * base = buffer->iface.get_base(buffer);
|
||||
|
||||
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
||||
@ -122,6 +132,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
// clear is optional if the buffer is zero-sized
|
||||
if (buffer->size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
buffer->iface.clear(buffer, value);
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
@ -134,10 +153,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
|
||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
buffer->iface.clear(buffer, value);
|
||||
}
|
||||
|
||||
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
@ -198,7 +213,7 @@ void ggml_backend_free(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return backend->iface.get_default_buffer_type(backend);
|
||||
return ggml_backend_dev_buffer_type(backend->device);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||
@ -238,43 +253,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
|
||||
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
|
||||
if (!size) {
|
||||
return;
|
||||
}
|
||||
|
||||
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
|
||||
if (!size) {
|
||||
return;
|
||||
}
|
||||
|
||||
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
|
||||
if (!size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
|
||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
|
||||
|
||||
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
||||
}
|
||||
@ -316,32 +330,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
|
||||
}
|
||||
|
||||
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
// helper to ease transition to device interface
|
||||
if (backend->device) {
|
||||
return ggml_backend_dev_supports_op(backend->device, op);
|
||||
}
|
||||
|
||||
return backend->iface.supports_op(backend, op);
|
||||
return ggml_backend_dev_supports_op(backend->device, op);
|
||||
}
|
||||
|
||||
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
// helper to ease transition to device interface
|
||||
if (backend->device) {
|
||||
return ggml_backend_dev_supports_buft(backend->device, buft);
|
||||
}
|
||||
return backend->iface.supports_buft(backend, buft);
|
||||
return ggml_backend_dev_supports_buft(backend->device, buft);
|
||||
}
|
||||
|
||||
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
// helper to ease transition to device interface
|
||||
if (backend->device) {
|
||||
return ggml_backend_dev_offload_op(backend->device, op);
|
||||
}
|
||||
|
||||
if (backend->iface.offload_op != NULL) {
|
||||
return backend->iface.offload_op(backend, op);
|
||||
}
|
||||
return false;
|
||||
return ggml_backend_dev_offload_op(backend->device, op);
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
||||
@ -582,6 +579,9 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_VULKAN
|
||||
register_backend(ggml_backend_vk_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CANN
|
||||
register_backend(ggml_backend_cann_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_BLAS
|
||||
register_backend(ggml_backend_blas_reg());
|
||||
#endif
|
||||
@ -591,9 +591,6 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_AMX
|
||||
register_backend(ggml_backend_amx_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CANN
|
||||
register_backend(ggml_backend_cann_reg());
|
||||
#endif
|
||||
|
||||
// TODO: kompute
|
||||
|
||||
@ -701,9 +698,9 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_init_best(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
||||
if (!dev) {
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
}
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
@ -711,13 +708,7 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
return ggml_backend_dev_init(dev, NULL);
|
||||
}
|
||||
|
||||
// backend CPU
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
// CPU backend - buffer
|
||||
|
||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
uintptr_t data = (uintptr_t)buffer->context;
|
||||
@ -767,7 +758,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
||||
}
|
||||
|
||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
@ -780,7 +770,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
};
|
||||
|
||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
||||
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
@ -792,6 +781,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
// CPU backend - buffer type
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU";
|
||||
|
||||
@ -799,19 +790,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
auto alloc_size = size;
|
||||
if (alloc_size == 0) {
|
||||
alloc_size = 1;
|
||||
}
|
||||
|
||||
void * data = ggml_aligned_malloc(alloc_size);
|
||||
void * data = ggml_aligned_malloc(size);
|
||||
|
||||
if (data == NULL) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
@ -843,6 +829,29 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||
return &ggml_backend_cpu_buffer_type;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU_Mapped";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||
},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_buffer_type;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
|
||||
// buffer type HBM
|
||||
@ -855,18 +864,11 @@ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffe
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
||||
return "CPU_HBM";
|
||||
|
||||
GGML_UNUSED(buf);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
hbw_free(buffer->context);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
//void * ptr = hbw_malloc(size);
|
||||
void * ptr;
|
||||
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||
if (result != 0) {
|
||||
@ -876,7 +878,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
@ -899,6 +900,21 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
|
||||
static ggml_backend_buffer_type_t bufts[] = {
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
ggml_backend_cpu_hbm_buffer_type(),
|
||||
#endif
|
||||
NULL
|
||||
};
|
||||
|
||||
return bufts;
|
||||
|
||||
GGML_UNUSED(device);
|
||||
}
|
||||
|
||||
// CPU backend - backend (stream)
|
||||
|
||||
struct ggml_backend_cpu_context {
|
||||
int n_threads;
|
||||
ggml_threadpool_t threadpool;
|
||||
@ -923,12 +939,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return ggml_backend_cpu_buffer_type();
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
struct ggml_backend_plan_cpu {
|
||||
struct ggml_cplan cplan;
|
||||
struct ggml_cgraph cgraph;
|
||||
@ -998,7 +1008,6 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
|
||||
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_get_name,
|
||||
/* .free = */ ggml_backend_cpu_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
@ -1008,9 +1017,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
@ -1081,10 +1087,10 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
||||
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
||||
}
|
||||
|
||||
////////////////////////
|
||||
// CPU backend - device
|
||||
|
||||
struct ggml_backend_cpu_device_context {
|
||||
std::string description = "CPU";
|
||||
@ -1171,7 +1177,7 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
|
||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
@ -1189,7 +1195,7 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||
return ggml_backend_cpu_init();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
@ -1202,7 +1208,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_b
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
@ -1244,10 +1250,10 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
||||
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
||||
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_cpu_device_init,
|
||||
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
|
||||
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
|
||||
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
@ -1256,7 +1262,7 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
////////////////////////
|
||||
// CPU backend - backend (reg)
|
||||
|
||||
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
|
||||
return "CPU";
|
||||
@ -1287,6 +1293,10 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
|
||||
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_n_threads;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
||||
return (void *)ggml_backend_cpu_get_extra_bufts;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
@ -1315,12 +1325,6 @@ struct ggml_backend_multi_buffer_context {
|
||||
size_t n_buffers;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
||||
|
||||
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
|
||||
}
|
||||
|
||||
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
||||
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
||||
@ -1339,7 +1343,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
|
||||
}
|
||||
|
||||
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
|
||||
/* .get_name = */ ggml_backend_multi_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
||||
/* .get_base = */ NULL,
|
||||
/* .init_tensor = */ NULL,
|
||||
@ -1368,7 +1371,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
|
||||
}
|
||||
|
||||
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
|
||||
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
|
||||
}
|
||||
|
||||
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||
@ -1460,7 +1463,7 @@ struct ggml_backend_sched {
|
||||
char * context_buffer;
|
||||
size_t context_buffer_size;
|
||||
|
||||
bool debug;
|
||||
int debug;
|
||||
};
|
||||
|
||||
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
||||
@ -1500,7 +1503,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
|
||||
return -1;
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
|
||||
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
||||
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
||||
@ -1548,7 +1551,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
// skip ROPE since the rope freqs tensor is too small to choose a backend based on it
|
||||
// not an ideal solution
|
||||
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||
// check if a backend with higher prio wants to offload the op
|
||||
if (src_backend_id == sched->n_backends - 1) {
|
||||
@ -1595,19 +1600,21 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
||||
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
if (sched->debug > 1) {
|
||||
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
||||
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
||||
GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
||||
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
||||
}
|
||||
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
||||
GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
||||
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
||||
GGML_LOG_DEBUG("\n");
|
||||
}
|
||||
GGML_LOG_DEBUG("\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -1899,11 +1906,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
// check if a weight is on a different backend
|
||||
// check if a weight is on a different and incompatible backend
|
||||
// by starting a new split, the memory of the previously offloaded weights can be reused
|
||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = tensor_backend_id(src);
|
||||
if (src_backend_id != cur_backend_id) {
|
||||
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
||||
need_new_split = true;
|
||||
break;
|
||||
}
|
||||
@ -1915,7 +1922,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
||||
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
||||
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
||||
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
||||
need_new_split = true;
|
||||
break;
|
||||
}
|
||||
@ -2240,7 +2246,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
|
||||
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
|
||||
|
||||
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
|
||||
const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
|
||||
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
|
||||
sched->n_backends = n_backends;
|
||||
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
||||
|
||||
|
@ -224,12 +224,6 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return ggml_backend_cpu_buffer_type();
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
|
||||
|
||||
@ -265,7 +259,6 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
|
||||
static struct ggml_backend_i blas_backend_i = {
|
||||
/* .get_name = */ ggml_backend_blas_get_name,
|
||||
/* .free = */ ggml_backend_blas_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
@ -275,9 +268,6 @@ static struct ggml_backend_i blas_backend_i = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_blas_graph_compute,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
@ -356,7 +346,7 @@ static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
@ -374,7 +364,7 @@ static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct gg
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||
return ggml_backend_blas_init();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
@ -387,7 +377,7 @@ static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
@ -456,10 +446,10 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
|
||||
/* .get_memory = */ ggml_backend_blas_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_blas_device_get_type,
|
||||
/* .get_props = */ ggml_backend_blas_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_blas_device_init,
|
||||
/* .init_backend = */ ggml_backend_blas_device_init_backend,
|
||||
/* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
|
||||
/* .supports_op = */ ggml_backend_blas_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_blas_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
|
@ -489,23 +489,6 @@ struct ggml_backend_cann_buffer_context {
|
||||
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Retrieve the name associated with a CANN buffer.
|
||||
*
|
||||
* This function returns the name of a CANN buffer, which is stored in the
|
||||
* context of the buffer.
|
||||
*
|
||||
* @param buffer The CANN buffer whose name is to be retrieved.
|
||||
* @return A pointer to a C-string containing the name of the buffer.
|
||||
*/
|
||||
|
||||
static const char* ggml_backend_cann_buffer_get_name(
|
||||
ggml_backend_buffer_t buffer) {
|
||||
return "CANN";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check if a buffer is a CANN buffer.
|
||||
*
|
||||
@ -515,9 +498,10 @@ static const char* ggml_backend_cann_buffer_get_name(
|
||||
* @param buffer The buffer to check.
|
||||
* @return true if the buffer is a CANN buffer, false otherwise.
|
||||
*/
|
||||
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
|
||||
static bool ggml_backend_buffer_is_cann(
|
||||
ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
|
||||
return ggml_backend_buft_is_cann(buffer->buft);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -965,7 +949,6 @@ static void ggml_backend_cann_buffer_clear(
|
||||
* on a CANN buffer within the backend.
|
||||
*/
|
||||
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cann_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cann_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
|
||||
@ -999,9 +982,10 @@ struct ggml_backend_cann_buffer_type_context {
|
||||
*/
|
||||
static const char* ggml_backend_cann_buffer_type_name(
|
||||
ggml_backend_buffer_type_t buft) {
|
||||
return "CANN";
|
||||
ggml_backend_cann_buffer_type_context* buft_ctx =
|
||||
(ggml_backend_cann_buffer_type_context*)buft->context;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
return buft_ctx->name.c_str();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1465,24 +1449,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Retrieves the default buffer type associated with the CANN backend.
|
||||
*
|
||||
* This function returns the buffer type specific to the device associated
|
||||
* with the CANN backend. It is used to allocate buffers for computations
|
||||
* performed by the backend.
|
||||
*
|
||||
* @param backend Pointer to the CANN backend structure.
|
||||
* @return Pointer to the buffer type structure for the CANN backend.
|
||||
*/
|
||||
static ggml_backend_buffer_type_t
|
||||
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
||||
return ggml_backend_cann_buffer_type(cann_ctx->device);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Sets tensor data asynchronously in the CANN backend.
|
||||
*
|
||||
@ -1863,7 +1829,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
|
||||
static const ggml_backend_i ggml_backend_cann_interface = {
|
||||
/* .get_name = */ ggml_backend_cann_name,
|
||||
/* .free = */ ggml_backend_cann_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
|
||||
@ -1873,9 +1838,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_cann_graph_compute,
|
||||
/* .supports_op = */ NULL, // moved to device
|
||||
/* .supports_buft = */ NULL, // moved to device
|
||||
/* .offload_op = */ NULL, // moved to device
|
||||
/* .event_record = */ ggml_backend_cann_event_record,
|
||||
/* .event_wait = */ ggml_backend_cann_event_wait,
|
||||
};
|
||||
@ -1918,7 +1880,7 @@ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
|
||||
GGML_UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
|
@ -421,20 +421,15 @@ struct ggml_backend_cuda_buffer_context {
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
return ctx->dev_ptr;
|
||||
@ -515,7 +510,6 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
||||
}
|
||||
|
||||
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
||||
@ -548,8 +542,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
|
||||
ggml_cuda_set_device(buft_ctx->device);
|
||||
|
||||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||
|
||||
void * dev_ptr;
|
||||
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
||||
if (err != cudaSuccess) {
|
||||
@ -657,7 +649,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
||||
}
|
||||
|
||||
struct ggml_backend_cuda_split_buffer_type_context {
|
||||
int main_device;
|
||||
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
struct ggml_backend_cuda_split_buffer_context {
|
||||
@ -680,16 +674,6 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_CUDA_NAME "_Split";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
||||
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
@ -833,7 +817,6 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
|
||||
}
|
||||
|
||||
static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
||||
@ -848,9 +831,9 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||
// cuda split buffer type
|
||||
|
||||
static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_CUDA_NAME "_Split";
|
||||
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
||||
@ -915,11 +898,11 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
|
||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
||||
static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
|
||||
|
||||
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
|
||||
|
||||
@ -937,18 +920,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
|
||||
}
|
||||
}
|
||||
|
||||
auto it = buft_map.find(tensor_split_arr);
|
||||
auto it = buft_map.find({main_device, tensor_split_arr});
|
||||
if (it != buft_map.end()) {
|
||||
return &it->second;
|
||||
}
|
||||
auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
|
||||
main_device,
|
||||
tensor_split_arr,
|
||||
GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer_type buft {
|
||||
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
|
||||
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
auto result = buft_map.emplace(tensor_split_arr, buft);
|
||||
auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
|
||||
return &result.first->second;
|
||||
}
|
||||
|
||||
@ -960,12 +948,6 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_CUDA_NAME "_Host";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||
}
|
||||
@ -998,7 +980,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
@ -1400,7 +1381,7 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
||||
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||
GGML_ASSERT(!(split && ne02 > 1));
|
||||
GGML_ASSERT(!(split && ne03 > 1));
|
||||
GGML_ASSERT(!(split && ne02 < ne12));
|
||||
@ -1890,7 +1871,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
||||
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||
|
||||
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
@ -2017,7 +1998,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
||||
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
@ -2150,7 +2131,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
|
||||
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
|
||||
// why is this here instead of mul_mat?
|
||||
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
|
||||
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
|
||||
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
|
||||
}
|
||||
|
||||
@ -2371,12 +2352,6 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
@ -2582,7 +2557,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
||||
if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
|
||||
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
||||
@ -2669,7 +2644,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] != nullptr) {
|
||||
assert(node->src[j]->buffer);
|
||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
|
||||
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -2762,7 +2738,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
|
||||
if (stat == cudaErrorGraphExecUpdateFailure) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
|
||||
#endif
|
||||
// The pre-existing graph exec cannot be updated due to violated constraints
|
||||
// so instead clear error and re-instantiate
|
||||
@ -2811,7 +2787,6 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
|
||||
static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_get_name,
|
||||
/* .free = */ ggml_backend_cuda_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
||||
@ -2821,9 +2796,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||
/* .supports_op = */ NULL, // moved to device
|
||||
/* .supports_buft = */ NULL, // moved to device
|
||||
/* .offload_op = */ NULL, // moved to device
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
};
|
||||
@ -2913,7 +2885,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
|
||||
GGML_UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
@ -2937,7 +2909,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||
GGML_UNUSED(params);
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
return ggml_backend_cuda_init(ctx->device);
|
||||
@ -2953,18 +2925,29 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
|
||||
return ggml_backend_cuda_host_buffer_type();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(ptr);
|
||||
GGML_UNUSED(size);
|
||||
GGML_UNUSED(max_tensor_size);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// TODO: move these functions here
|
||||
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
||||
|
||||
// split buffers can only be used with GGML_OP_MUL_MAT
|
||||
if (op->op != GGML_OP_MUL_MAT) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check if all the sources are allocated on this device
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
|
||||
if (buft_ctx->device != dev_ctx->device) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
@ -3190,24 +3173,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
if (ggml_backend_buft_is_cuda_split(buft)) {
|
||||
return true;
|
||||
}
|
||||
return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
|
||||
}
|
||||
|
||||
if (ggml_backend_buft_is_cuda(buft)) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
return buft_ctx->device == dev_ctx->device;
|
||||
static int64_t get_op_batch_size(const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_GET_ROWS:
|
||||
return 0;
|
||||
case GGML_OP_MUL_MAT:
|
||||
return op->ne[1];
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
case GGML_OP_ROPE:
|
||||
return op->ne[2];
|
||||
default:
|
||||
return ggml_nrows(op);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
|
||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
return get_op_batch_size(op) >= min_batch_size;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
@ -3248,10 +3234,10 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||
/* .get_memory = */ ggml_backend_cuda_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_cuda_device_get_type,
|
||||
/* .get_props = */ ggml_backend_cuda_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_cuda_device_init,
|
||||
/* .init_backend = */ ggml_backend_cuda_device_init_backend,
|
||||
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_cuda_device_offload_op,
|
||||
|
@ -1820,11 +1820,6 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
|
||||
}
|
||||
}
|
||||
|
||||
static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
auto * memory = (ggml_vk_memory *)buffer->context;
|
||||
if (ggml_vk_has_device()) {
|
||||
@ -1868,7 +1863,6 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
|
||||
/* .get_name = */ ggml_backend_kompute_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_kompute_buffer_get_base,
|
||||
/* .init_tensor = */ NULL,
|
||||
@ -1953,11 +1947,6 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
|
||||
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
|
||||
return ggml_backend_kompute_buffer_type(ctx->device);
|
||||
}
|
||||
|
||||
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
|
||||
ggml_vk_graph_compute(ctx, cgraph);
|
||||
@ -1977,7 +1966,6 @@ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_back
|
||||
static struct ggml_backend_i kompute_backend_i = {
|
||||
/* .get_name = */ ggml_backend_kompute_name,
|
||||
/* .free = */ ggml_backend_kompute_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
@ -1987,9 +1975,6 @@ static struct ggml_backend_i kompute_backend_i = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_kompute_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
|
@ -3247,12 +3247,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
|
||||
// backend interface
|
||||
|
||||
static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return "Metal";
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||
|
||||
@ -3307,7 +3301,6 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
||||
/* .get_name = */ ggml_backend_metal_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
||||
/* .init_tensor = */ NULL,
|
||||
@ -3432,6 +3425,29 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||
return &ggml_backend_buffer_type_metal;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "Metal_Mapped";
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
||||
},
|
||||
/* .device = */ &g_ggml_backend_metal_device,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_buffer_from_ptr_type_metal;
|
||||
}
|
||||
|
||||
// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
|
||||
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
||||
struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
|
||||
@ -3508,7 +3524,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
||||
}
|
||||
}
|
||||
|
||||
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||
}
|
||||
|
||||
// backend
|
||||
@ -3529,12 +3545,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
|
||||
free(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return ggml_backend_metal_buffer_type();
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
return ggml_metal_graph_compute(backend, cgraph);
|
||||
}
|
||||
@ -3601,7 +3611,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
static struct ggml_backend_i ggml_backend_metal_i = {
|
||||
/* .get_name = */ ggml_backend_metal_name,
|
||||
/* .free = */ ggml_backend_metal_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
@ -3611,9 +3620,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
@ -3708,7 +3714,7 @@ static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
@ -178,7 +178,6 @@ struct ggml_backend_rpc_buffer_context {
|
||||
std::shared_ptr<socket_t> sock;
|
||||
std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
|
||||
uint64_t remote_ptr;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
// RPC helper functions
|
||||
@ -409,11 +408,6 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
return sock;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
|
||||
@ -524,7 +518,6 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_rpc_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_rpc_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor,
|
||||
@ -551,7 +544,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
||||
if (response.remote_ptr != 0) {
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||
ggml_backend_rpc_buffer_interface,
|
||||
new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
|
||||
new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
|
||||
response.remote_size);
|
||||
return buffer;
|
||||
} else {
|
||||
@ -609,11 +602,6 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
// this is no-op because we don't have any async operations
|
||||
@ -670,7 +658,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
|
||||
static ggml_backend_i ggml_backend_rpc_interface = {
|
||||
/* .get_name = */ ggml_backend_rpc_name,
|
||||
/* .free = */ ggml_backend_rpc_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
@ -680,9 +667,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
@ -1278,7 +1262,7 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
||||
// TODO: obtain value from the server
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
@ -249,13 +249,10 @@ struct ggml_backend_sycl_buffer_context {
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft);
|
||||
|
||||
static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
|
||||
return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name;
|
||||
}
|
||||
|
||||
static void
|
||||
@ -440,7 +437,6 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_sycl_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_sycl_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor,
|
||||
@ -698,16 +694,6 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
std::vector<queue_ptr> streams;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_SYCL_NAME "_Split";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@ -915,7 +901,6 @@ static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, u
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_sycl_split_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_sycl_split_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor,
|
||||
@ -935,6 +920,10 @@ static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_bu
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
|
||||
return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
||||
// instead, we allocate them for each tensor separately in init_tensor
|
||||
@ -1040,12 +1029,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_SYCL_NAME "_Host";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_sycl_host_free(buffer->context);
|
||||
}
|
||||
@ -1061,7 +1044,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
|
||||
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
@ -4889,12 +4871,6 @@ static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_tensor *tensor,
|
||||
const void *data, size_t offset,
|
||||
@ -5031,7 +5007,6 @@ static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_ev
|
||||
static ggml_backend_i ggml_backend_sycl_interface = {
|
||||
/* .get_name = */ ggml_backend_sycl_get_name,
|
||||
/* .free = */ ggml_backend_sycl_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
|
||||
@ -5043,9 +5018,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
||||
/* .supports_op = */ NULL, // moved to device
|
||||
/* .supports_buft = */ NULL, // moved to device
|
||||
/* .offload_op = */ NULL, // moved to device
|
||||
/* .event_record = */ ggml_backend_sycl_event_record,
|
||||
/* .event_wait = */ ggml_backend_sycl_event_wait,
|
||||
};
|
||||
@ -5092,7 +5064,7 @@ static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
|
||||
GGML_UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
@ -5388,12 +5360,14 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
|
||||
return ctx->devices[index];
|
||||
}
|
||||
|
||||
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name)
|
||||
{
|
||||
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
|
||||
GGML_UNUSED(reg);
|
||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
return (void *)ggml_backend_sycl_split_buffer_type;
|
||||
}
|
||||
|
||||
// TODO: update to the current function signature
|
||||
//if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
// return (void *)ggml_backend_sycl_split_buffer_type;
|
||||
//}
|
||||
|
||||
// SYCL doesn't support registering host memory, left here for reference
|
||||
// "ggml_backend_register_host_buffer"
|
||||
// "ggml_backend_unregister_host_buffer"
|
||||
|
@ -6247,13 +6247,8 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
|
||||
// device backend
|
||||
|
||||
static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
|
||||
return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@ -6317,7 +6312,6 @@ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t v
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_vk_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
|
||||
@ -6413,7 +6407,6 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
@ -6646,7 +6639,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_name,
|
||||
/* .free = */ ggml_backend_vk_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||
@ -6656,9 +6648,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
@ -6717,7 +6706,7 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
|
||||
//////////////////////////
|
||||
|
||||
struct ggml_backend_vk_device_context {
|
||||
int device;
|
||||
size_t device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
};
|
||||
@ -6749,7 +6738,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||
UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
@ -6758,9 +6747,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
props->type = ggml_backend_vk_device_get_type(dev);
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* async */ false,
|
||||
/* host_buffer */ true,
|
||||
/* events */ false,
|
||||
/* .async = */ false,
|
||||
/* .host_buffer = */ true,
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
@ -6949,7 +6939,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
|
||||
|
@ -4028,7 +4028,9 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
|
||||
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
|
||||
GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
|
||||
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
|
||||
assert(false);
|
||||
#ifndef NDEBUG
|
||||
GGML_ABORT("not enough space in the context's memory pool");
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -205,7 +205,7 @@ extern "C" {
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
||||
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
||||
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
|
||||
};
|
||||
|
||||
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||
@ -274,10 +274,7 @@ extern "C" {
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||
|
||||
// main_gpu interpretation depends on split_mode:
|
||||
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
|
||||
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
|
||||
// LLAMA_SPLIT_MODE_LAYER: ignored
|
||||
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
|
||||
int32_t main_gpu;
|
||||
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
|
@ -20,7 +20,7 @@ logger = logging.getLogger("compare-llama-bench")
|
||||
# Properties by which to differentiate results per commit:
|
||||
KEY_PROPERTIES = [
|
||||
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
|
||||
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
|
||||
"blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
|
||||
"type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
|
||||
]
|
||||
|
||||
|
3098
src/llama.cpp
3098
src/llama.cpp
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user