// Note: porting this file to C++ is a work in progress #include "ggml-backend-impl.h" #include "ggml-alloc.h" #include "ggml-impl.h" #include #include #include #include #include #include #include // backend buffer type const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { return buft->iface.get_name(buft); } ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) { return buft->iface.get_alignment(buft); } size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { // get_max_size is optional, defaults to SIZE_MAX if (buft->iface.get_max_size) { return buft->iface.get_max_size(buft); } return SIZE_MAX; } size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { size_t size = buft->iface.get_alloc_size(buft, tensor); assert(size >= ggml_nbytes(tensor)); return size; } return ggml_nbytes(tensor); } bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { if (buft->iface.is_host) { return buft->iface.is_host(buft); } return false; } ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) { return buft->device; } // backend buffer ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, void * context, size_t size) { ggml_backend_buffer_t buffer = new ggml_backend_buffer { /* .interface = */ iface, /* .buft = */ buft, /* .context = */ context, /* .size = */ size, /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY }; return buffer; } const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) { return buffer->iface.get_name(buffer); } void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer == NULL) { return; } if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } delete buffer; } size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { return buffer->size; } void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { void * base = buffer->iface.get_base(buffer); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); return base; } void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { // init_tensor is optional if (buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); } } size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) { return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer)); } size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { buffer->iface.clear(buffer, value); } bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); } void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { buffer->usage = usage; // FIXME: add a generic callback to the buffer interface if (ggml_backend_buffer_is_multi_buffer(buffer)) { ggml_backend_multi_buffer_set_usage(buffer, usage); } } enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) { return buffer->usage; } ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { return buffer->buft; } void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { if (buffer->iface.reset) { buffer->iface.reset(buffer); } } bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer; if (dst_buf->iface.cpy_tensor) { return dst_buf->iface.cpy_tensor(dst_buf, src, dst); } return false; } // backend ggml_guid_t ggml_backend_guid(ggml_backend_t backend) { if (backend == NULL) { return NULL; } return backend->guid; } const char * ggml_backend_name(ggml_backend_t backend) { if (backend == NULL) { return "NULL"; } return backend->iface.get_name(backend); } void ggml_backend_free(ggml_backend_t backend) { if (backend == NULL) { return; } backend->iface.free(backend); } ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) { return backend->iface.get_default_buffer_type(backend); } ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size); } size_t ggml_backend_get_alignment(ggml_backend_t backend) { return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend)); } size_t ggml_backend_get_max_size(ggml_backend_t backend) { return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend)); } void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (backend->iface.set_tensor_async == NULL) { ggml_backend_tensor_set(tensor, data, offset, size); } else { backend->iface.set_tensor_async(backend, tensor, data, offset, size); } } void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); if (backend->iface.get_tensor_async == NULL) { ggml_backend_tensor_get(tensor, data, offset, size); } else { backend->iface.get_tensor_async(backend, tensor, data, offset, size); } } void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (!size) { return; } buf->iface.set_tensor(buf, tensor, data, offset, size); } void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); if (!size) { return; } buf->iface.get_tensor(buf, tensor, data, offset, size); } GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (!size) { return; } GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer"); buf->iface.memset_tensor(buf, tensor, value, offset, size); } void ggml_backend_synchronize(ggml_backend_t backend) { if (backend->iface.synchronize == NULL) { return; } backend->iface.synchronize(backend); } ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_ASSERT(backend->iface.graph_plan_create != NULL); return backend->iface.graph_plan_create(backend, cgraph); } void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(backend->iface.graph_plan_free != NULL); backend->iface.graph_plan_free(backend, plan); } enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(backend->iface.graph_plan_compute != NULL); return backend->iface.graph_plan_compute(backend, plan); } enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); ggml_backend_synchronize(backend); return err; } enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_compute(backend, cgraph); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { // helper to ease transition to device interface if (backend->device) { return ggml_backend_dev_supports_op(backend->device, op); } return backend->iface.supports_op(backend, op); } bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { // helper to ease transition to device interface if (backend->device) { return ggml_backend_dev_supports_buft(backend->device, buft); } return backend->iface.supports_buft(backend, buft); } bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { // helper to ease transition to device interface if (backend->device) { return ggml_backend_dev_offload_op(backend->device, op); } if (backend->iface.offload_op != NULL) { return backend->iface.offload_op(backend, op); } return false; } ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { return backend->device; } // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { if (a->type != b->type) { return false; } for (int i = 0; i < GGML_MAX_DIMS; i++) { if (a->ne[i] != b->ne[i]) { return false; } if (a->nb[i] != b->nb[i]) { return false; } } return true; } void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); if (src == dst) { return; } if (ggml_backend_buffer_is_host(src->buffer)) { ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); } else if (ggml_backend_buffer_is_host(dst->buffer)) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { #ifndef NDEBUG fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); #endif size_t nbytes = ggml_nbytes(src); void * data = malloc(nbytes); ggml_backend_tensor_get(src, data, 0, nbytes); ggml_backend_tensor_set(dst, data, 0, nbytes); free(data); } } void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); if (src == dst) { return; } if (backend_dst->iface.cpy_tensor_async != NULL) { if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) { return; } } // an async copy would normally happen after all the queued operations on both backends are completed // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy ggml_backend_synchronize(backend_src); ggml_backend_synchronize(backend_dst); ggml_backend_tensor_copy(src, dst); } // events ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) { // null device is allowed for the transition period to the device interface if (device == NULL || device->iface.event_new == NULL) { return NULL; } return device->iface.event_new(device); } void ggml_backend_event_free(ggml_backend_event_t event) { if (event == NULL) { return; } event->device->iface.event_free(event->device, event); } void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) { GGML_ASSERT(backend->iface.event_record != NULL); backend->iface.event_record(backend, event); } void ggml_backend_event_synchronize(ggml_backend_event_t event) { GGML_ASSERT(event->device->iface.event_synchronize); event->device->iface.event_synchronize(event->device, event); } void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { GGML_ASSERT(backend->iface.event_wait != NULL); backend->iface.event_wait(backend, event); } // Backend device const char * ggml_backend_dev_name(ggml_backend_dev_t device) { return device->iface.get_name(device); } const char * ggml_backend_dev_description(ggml_backend_dev_t device) { return device->iface.get_description(device); } void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { device->iface.get_memory(device, free, total); } enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) { return device->iface.get_type(device); } void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) { device->iface.get_props(device, props); } ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) { return device->reg; } ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) { return device->iface.init_backend(device, params); } ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { return device->iface.get_buffer_type(device); } ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) { return device->iface.get_host_buffer_type(device); } ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) { return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size); } bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { return device->iface.supports_op(device, op); } bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) { return device->iface.supports_buft(device, buft); } bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { return device->iface.offload_op(device, op); } // Backend (reg) const char * ggml_backend_reg_name(ggml_backend_reg_t reg) { return reg->iface.get_name(reg); } size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) { return reg->iface.get_device_count(reg); } ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) { return reg->iface.get_device(reg, index); } void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { if (!reg->iface.get_proc_address) { return NULL; } return reg->iface.get_proc_address(reg, name); } // Backend registry #ifdef GGML_USE_CUDA #include "ggml-cuda.h" #endif struct ggml_backend_registry { std::vector backends; std::vector devices; ggml_backend_registry() { #ifdef GGML_USE_CUDA register_backend(ggml_backend_cuda_reg()); #endif register_backend(ggml_backend_cpu_reg()); // TODO: sycl, metal, vulkan, kompute, cann } void register_backend(ggml_backend_reg_t reg) { #ifndef NDEBUG fprintf(stderr, "%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif backends.push_back(reg); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); } } void register_device(ggml_backend_dev_t device) { #ifndef NDEBUG fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device)); #endif devices.push_back(device); } }; static ggml_backend_registry & get_reg() { static ggml_backend_registry reg; return reg; } // Internal API void ggml_backend_register(ggml_backend_reg_t reg) { get_reg().register_backend(reg); } void ggml_backend_device_register(ggml_backend_dev_t device) { get_reg().register_device(device); } // Backend (reg) enumeration size_t ggml_backend_reg_count() { return get_reg().backends.size(); } ggml_backend_reg_t ggml_backend_reg_get(size_t index) { GGML_ASSERT(index < ggml_backend_reg_count()); return get_reg().backends[index]; } ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { for (size_t i = 0; i < ggml_backend_reg_count(); i++) { ggml_backend_reg_t reg = ggml_backend_reg_get(i); if (strcmp(ggml_backend_reg_name(reg), name) == 0) { return reg; } } return NULL; } // Device enumeration size_t ggml_backend_dev_count() { return get_reg().devices.size(); } ggml_backend_dev_t ggml_backend_dev_get(size_t index) { GGML_ASSERT(index < ggml_backend_dev_count()); return get_reg().devices[index]; } ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); if (strcmp(ggml_backend_dev_name(dev), name) == 0) { return dev; } } return NULL; } ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev) == type) { return dev; } } return NULL; } // Convenience functions ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); if (!dev) { return NULL; } return ggml_backend_dev_init(dev, params); } ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); if (!dev) { return NULL; } return ggml_backend_dev_init(dev, params); } ggml_backend_t ggml_backend_init_best(void) { ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL); if (!dev) { dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL); } if (!dev) { return NULL; } return ggml_backend_dev_init(dev, NULL); } // backend CPU static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) { return "CPU"; GGML_UNUSED(buffer); } static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { uintptr_t data = (uintptr_t)buffer->context; // align the buffer if (data % TENSOR_ALIGNMENT != 0) { data = GGML_PAD(data, TENSOR_ALIGNMENT); } return (void *)data; } static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { memset((char *)tensor->data + offset, value, size); GGML_UNUSED(buffer); } static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); GGML_UNUSED(buffer); } static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); GGML_UNUSED(buffer); } static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; } return false; GGML_UNUSED(buffer); } static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer->context, value, buffer->size); } static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { /* .get_name = */ ggml_backend_cpu_buffer_get_name, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, /* .clear = */ ggml_backend_cpu_buffer_clear, /* .reset = */ NULL, }; static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { /* .get_name = */ ggml_backend_cpu_buffer_get_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, /* .clear = */ ggml_backend_cpu_buffer_clear, /* .reset = */ NULL, }; static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU"; GGML_UNUSED(buft); } static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h) if (data == NULL) { fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); return NULL; } return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size); } static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return TENSOR_ALIGNMENT; GGML_UNUSED(buft); } static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; GGML_UNUSED(buft); } ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .context = */ NULL, }; return &ggml_backend_cpu_buffer_type; } #ifdef GGML_USE_CPU_HBM // buffer type HBM #include static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU_HBM"; GGML_UNUSED(buft); } static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { return "CPU_HBM"; GGML_UNUSED(buf); } static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { //void * ptr = hbw_malloc(size); void * ptr; int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); if (result != 0) { fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size); return NULL; } ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name; buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; return buffer; } ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { /* .iface = */ { /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, }; return &ggml_backend_cpu_buffer_type_hbm; } #endif struct ggml_backend_cpu_context { int n_threads; ggml_threadpool_t threadpool; uint8_t * work_data; size_t work_size; ggml_abort_callback abort_callback; void * abort_callback_data; }; static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { return "CPU"; GGML_UNUSED(backend); } static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; delete[] cpu_ctx->work_data; delete cpu_ctx; delete backend; } static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(backend); } struct ggml_backend_plan_cpu { struct ggml_cplan cplan; struct ggml_cgraph cgraph; }; static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu; cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; if (cpu_plan->cplan.work_data == NULL) { delete cpu_plan; return NULL; } } cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; return cpu_plan; } static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; delete[] cpu_plan->cplan.work_data; delete cpu_plan; GGML_UNUSED(backend); } static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); GGML_UNUSED(backend); } static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { delete[] cpu_ctx->work_data; cpu_ctx->work_data = new uint8_t[cplan.work_size]; if (cpu_ctx->work_data == NULL) { cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } cpu_ctx->work_size = cplan.work_size; } cplan.work_data = (uint8_t *)cpu_ctx->work_data; cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; return ggml_graph_compute(cgraph, &cplan); } static const struct ggml_backend_i ggml_backend_cpu_i = { /* .get_name = */ ggml_backend_cpu_get_name, /* .free = */ ggml_backend_cpu_free, /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, /* .synchronize = */ NULL, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ NULL, /* .supports_buft = */ NULL, /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; static ggml_guid_t ggml_backend_cpu_guid(void) { static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 }; return &guid; } ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context; if (ctx == NULL) { return NULL; } ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->threadpool = NULL; ctx->work_data = NULL; ctx->work_size = 0; ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; ggml_backend_t cpu_backend = new ggml_backend { /* .guid = */ ggml_backend_cpu_guid(), /* .interface = */ ggml_backend_cpu_i, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .context = */ ctx, }; if (cpu_backend == NULL) { delete ctx; return NULL; } return cpu_backend; } bool ggml_backend_is_cpu(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid()); } void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; ctx->n_threads = n_threads; } void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; if (ctx->threadpool && ctx->threadpool != threadpool) { // already had a different threadpool, pause/suspend it before switching ggml_threadpool_pause(ctx->threadpool); } ctx->threadpool = threadpool; } void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; ctx->abort_callback = abort_callback; ctx->abort_callback_data = abort_callback_data; } ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); } //////////////////////// static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { return "CPU"; GGML_UNUSED(dev); } static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) { // TODO return "CPU"; GGML_UNUSED(dev); } static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { // TODO *free = 0; *total = 0; GGML_UNUSED(dev); } static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { return GGML_BACKEND_DEVICE_TYPE_CPU_FULL; GGML_UNUSED(dev); } static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { props->name = ggml_backend_cpu_device_get_name(dev); props->description = ggml_backend_cpu_device_get_description(dev); props->type = ggml_backend_cpu_device_get_type(dev); ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { /* async */ false, /* host_buffer */ false, /* events */ false, }; } static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) { return ggml_backend_cpu_init(); GGML_UNUSED(dev); GGML_UNUSED(params); } static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(dev); } static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { return ggml_backend_cpu_buffer_from_ptr(ptr, size); GGML_UNUSED(dev); GGML_UNUSED(max_tensor_size); } static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_CPY: return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S && op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; case GGML_OP_ROPE_BACK: return op->src[2] == NULL && (op->op_params[2] & 4) == 0; case GGML_OP_IM2COL_BACK: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; case GGML_OP_OUT_PROD: return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32; default: return true; } GGML_UNUSED(dev); } static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { return ggml_backend_buft_is_host(buft); GGML_UNUSED(dev); } static const struct ggml_backend_device_i ggml_backend_cpu_device_i = { /* .get_name = */ ggml_backend_cpu_device_get_name, /* .get_description = */ ggml_backend_cpu_device_get_description, /* .get_memory = */ ggml_backend_cpu_device_get_memory, /* .get_type = */ ggml_backend_cpu_device_get_type, /* .get_props = */ ggml_backend_cpu_device_get_props, /* .init_backend = */ ggml_backend_cpu_device_init, /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_cpu_device_supports_op, /* .supports_buft = */ ggml_backend_cpu_device_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_synchronize = */ NULL, }; //////////////////////// static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { return "CPU"; GGML_UNUSED(reg); } static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) { return 1; GGML_UNUSED(reg); } static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { GGML_ASSERT(index == 0); static ggml_backend_device ggml_backend_cpu_device = { /* .iface = */ ggml_backend_cpu_device_i, /* .reg = */ reg, /* .context = */ NULL, }; return &ggml_backend_cpu_device; GGML_UNUSED(reg); GGML_UNUSED(index); } static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = { /* .get_name = */ ggml_backend_cpu_reg_get_name, /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count, /* .get_device = */ ggml_backend_cpu_reg_get_device, /* .get_proc_address = */ NULL, }; ggml_backend_reg_t ggml_backend_cpu_reg(void) { static struct ggml_backend_reg ggml_backend_cpu_reg = { /* .iface = */ ggml_backend_cpu_reg_i, /* .context = */ NULL, }; return &ggml_backend_cpu_reg; } // multi-buffer buffer struct ggml_backend_multi_buffer_context { ggml_backend_buffer_t * buffers; size_t n_buffers; }; static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; return ctx->buffers[0]->iface.get_name(ctx->buffers[0]); } static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_free(ctx->buffers[i]); } free(ctx->buffers); free(ctx); } static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_clear(ctx->buffers[i], value); } } static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = { /* .get_name = */ ggml_backend_multi_buffer_get_name, /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, /* .get_base = */ NULL, /* .init_tensor = */ NULL, /* .memset_tensor = */ NULL, /* .set_tensor = */ NULL, /* .get_tensor = */ NULL, /* .cpy_tensor = */ NULL, /* .clear = */ ggml_backend_multi_buffer_clear, /* .reset = */ NULL, }; ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) { ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context)); ctx->n_buffers = n_buffers; ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t)); GGML_ASSERT(ctx->buffers != NULL); size_t total_size = 0; for (size_t i = 0; i < n_buffers; i++) { ctx->buffers[i] = buffers[i]; total_size += ggml_backend_buffer_get_size(buffers[i]); } return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size); } bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_multi_buffer_get_name; } void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer)); ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_set_usage(ctx->buffers[i], usage); } } // creates a copy of the tensor with the same memory layout static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) { struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor); for (int i = 0; i < GGML_MAX_DIMS; i++) { dup->nb[i] = tensor->nb[i]; } return dup; } static bool ggml_is_view_op(enum ggml_op op) { return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; } // scheduler #ifndef GGML_SCHED_MAX_BACKENDS #define GGML_SCHED_MAX_BACKENDS 16 #endif #ifndef GGML_SCHED_MAX_SPLIT_INPUTS #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC #endif #ifndef GGML_SCHED_MAX_COPIES #define GGML_SCHED_MAX_COPIES 4 #endif struct ggml_backend_sched_split { int backend_id; int i_start; int i_end; struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS]; int n_inputs; // graph view of this split struct ggml_cgraph graph; }; struct ggml_backend_sched { bool is_reset; // true if the scheduler has been reset since the last graph split bool is_alloc; int n_backends; ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS]; ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS]; ggml_gallocr_t galloc; // hash map of the nodes in the graph struct ggml_hash_set hash_set; int * hv_tensor_backend_ids; // [hash_set.size] struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies] int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] int * prev_node_backend_ids; // [graph_size] int * prev_leaf_backend_ids; // [graph_size] // copy of the graph with modified inputs struct ggml_cgraph graph; // graph splits struct ggml_backend_sched_split * splits; int n_splits; int splits_capacity; // pipeline parallelism support int n_copies; int cur_copy; ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]; struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS]; int n_graph_inputs; struct ggml_context * ctx; ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; char * context_buffer; size_t context_buffer_size; bool debug; }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)] #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)] #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id) // returns the priority of the backend, lower id is higher priority static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { for (int i = 0; i < sched->n_backends; i++) { if (sched->backends[i] == backend) { return i; } } return -1; } static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) { ggml_backend_buffer_t buffer = tensor->buffer; if (buffer == NULL) { return -1; } // find highest prio backend that supports the buffer type and the op for (int i = 0; i < sched->n_backends; i++) { if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) && ggml_backend_supports_op(sched->backends[i], op)) { return i; } } #ifndef NDEBUG fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name); #endif return -1; } #if 0 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else #define SET_CAUSE(node, ...) #define GET_CAUSE(node) "" #endif // returns the backend that should be used for the node based on the current locations static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) { // TODO: use supports_op to check if the backend supports the op // assign pre-allocated nodes to their backend int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.dst"); return cur_backend_id; } // view_src if (tensor->view_src != NULL) { cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.vsrc"); return cur_backend_id; } } if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { // since the tensor is pre-allocated, it cannot be moved to another backend GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation"); } // graph input if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU) SET_CAUSE(tensor, "1.inp"); return cur_backend_id; } // operations with weights are preferably run on the same backend as the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; if (src == NULL) { continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op if (src_backend_id == sched->n_backends - 1) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); return b; } } } SET_CAUSE(tensor, "1.wgt%d", i); return src_backend_id; } } return -1; } static char * fmt_size(size_t size) { static char buffer[128]; if (size >= 1024*1024) { snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024); } else { snprintf(buffer, sizeof(buffer), "%zuK", size/1024); } return buffer; } static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { int cur_split = 0; for (int i = 0; i < graph->n_nodes; i++) { if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j]))); } fprintf(stderr, "\n"); cur_split++; } struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } fprintf(stderr, "\n"); } } static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) { ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer; ggml_backend_buffer_type_t buft = NULL; if (buf) { // the tensor is already allocated buft = buf->buft; } else { // see if the tensor already has a backend assigned, and use the buffer type of that backend int tensor_backend_id = tensor_backend_id(t); if (tensor_backend_id == -1 && t->view_src) { tensor_backend_id = tensor_backend_id(t->view_src); } if (tensor_backend_id != -1) { buft = sched->bufts[tensor_backend_id]; } } return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft); } static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.sup"); } } // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { // reset splits sched->n_splits = 0; sched->n_graph_inputs = 0; sched->is_reset = false; struct ggml_init_params params = { /* .mem_size = */ sched->context_buffer_size, /* .mem_buffer = */ sched->context_buffer, /* .no_alloc = */ true }; ggml_free(sched->ctx); sched->ctx = ggml_init(params); if (sched->ctx == NULL) { GGML_ABORT("%s: failed to initialize context\n", __func__); } // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; int * leaf_backend_id = &tensor_backend_id(leaf); // do not overwrite user assignments if (*leaf_backend_id == -1) { *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * node_backend_id = &tensor_backend_id(node); // do not overwrite user assignments if (*node_backend_id == -1) { *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); #if 0 // src if (node->op == GGML_OP_NONE) { continue; } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } int * src_backend_id = &tensor_backend_id(src); if (*src_backend_id == -1) { *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); } } #endif } } // pass 2: expand current backend assignments // assign the same backend to adjacent nodes // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known // expand gpu down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; } else { cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } // expand gpu up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; } else { cur_backend_id = *node_backend_id; } } else if (cur_backend_id != -1) { ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } // expand rest down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } // expand rest up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } // pass 3: upgrade nodes to higher prio backends with compatible buffer types // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there // however, we also need to verify that the sources are in compatible buffer types // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU) // additionally, set remaining unassigned nodes to the backend with the most supported inputs // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id == -1) { // unassigned node: find the backend with the most supported inputs int n_supported_best = -1; for (int b = 0; b < sched->n_backends; b++) { if (ggml_backend_supports_op(sched->backends[b], node)) { int n_supported = 0; for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) { n_supported++; } } if (n_supported > n_supported_best) { n_supported_best = n_supported; *node_backend_id = b; SET_CAUSE(node, "3.best"); } } } } else { // assigned node: upgrade to higher prio backend if possible for (int b = 0; b < *node_backend_id; b++) { if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { bool supported = true; for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } if (!ggml_backend_sched_buffer_supported(sched, src, b)) { supported = false; break; } } if (supported) { *node_backend_id = b; SET_CAUSE(node, "3.upg"); break; } } } } } // pass 4: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * cur_backend_id = &tensor_backend_id(node); if (node->view_src != NULL && *cur_backend_id == -1) { *cur_backend_id = tensor_backend_id(node->view_src); SET_CAUSE(node, "4.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } int * src_backend_id = &tensor_backend_id(src); if (*src_backend_id == -1) { if (src->view_src != NULL) { // views are always on the same backend as the source *src_backend_id = tensor_backend_id(src->view_src); SET_CAUSE(src, "4.vsrc"); } else { *src_backend_id = *cur_backend_id; SET_CAUSE(src, "4.cur"); } } } } // pass 5: split graph, find tensors that need to be copied { int i_split = 0; struct ggml_backend_sched_split * split = &sched->splits[0]; // find the backend of the first split, skipping view ops int i = 0; for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { split->backend_id = tensor_backend_id(node); break; } } split->i_start = 0; split->n_inputs = 0; int cur_backend_id = split->backend_id; for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } const int node_backend_id = tensor_backend_id(node); assert(node_backend_id != -1); // all nodes should be assigned by now // check if we should start a new split based on the sources of the current node bool need_new_split = false; if (node_backend_id == cur_backend_id && split->n_inputs > 0) { for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } // check if a weight is on a different backend // by starting a new split, the memory of the previously offloaded weights can be reused if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = tensor_backend_id(src); if (src_backend_id != cur_backend_id) { need_new_split = true; break; } } // check if the split has too many inputs // FIXME: count the number of inputs instead of only checking when full if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); int src_backend_id = sched->hv_tensor_backend_ids[id]; bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; } } } } if (node_backend_id != cur_backend_id || need_new_split) { split->i_end = i; i_split++; if (i_split >= sched->splits_capacity) { sched->splits_capacity *= 2; sched->splits = (ggml_backend_sched_split *) realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); GGML_ASSERT(sched->splits != NULL); } split = &sched->splits[i_split]; split->backend_id = node_backend_id; split->i_start = i; split->n_inputs = 0; cur_backend_id = node_backend_id; } // find inputs that are not on the same backend for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } size_t src_id = hash_id(src); const int src_backend_id = sched->hv_tensor_backend_ids[src_id]; assert(src_backend_id != -1); // all inputs should be assigned by now if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy; if (c == sched->cur_copy) { tensor_copy = src; // use the original tensor as the current copy } else { tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c); } if (sched->n_copies > 1) { ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } tensor_id_copy(src_id, src_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } int n_graph_inputs = sched->n_graph_inputs++; GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); sched->graph_inputs[n_graph_inputs] = src; } } if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { // create a copy of the input in the split's backend if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) { ggml_backend_t backend = sched->backends[cur_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c); if (sched->n_copies > 1) { ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } int n_inputs = split->n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split->inputs[n_inputs] = src; } node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); } } } split->i_end = graph->n_nodes; sched->n_splits = i_split + 1; } if (sched->debug) { ggml_backend_sched_print_assignments(sched, graph); } // swap node_backend_ids and leaf _backend_ids with prevs { int * tmp = sched->node_backend_ids; sched->node_backend_ids = sched->prev_node_backend_ids; sched->prev_node_backend_ids = tmp; tmp = sched->leaf_backend_ids; sched->leaf_backend_ids = sched->prev_leaf_backend_ids; sched->prev_leaf_backend_ids = tmp; } int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; if (sched->graph.size < graph_size) { sched->graph.size = graph_size; sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); GGML_ASSERT(sched->graph.nodes != NULL); GGML_ASSERT(sched->graph.leafs != NULL); } sched->graph.n_nodes = 0; sched->graph.n_leafs = 0; struct ggml_cgraph * graph_copy = &sched->graph; for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { assert(graph_copy->size > (graph_copy->n_nodes + 1)); struct ggml_tensor * input = split->inputs[j]; const size_t input_id = hash_id(input); struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy); // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); input_dep->src[0] = input; sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id]; graph_copy->nodes[graph_copy->n_nodes++] = input_dep; // add a dependency to the input copy so that it is allocated at the start of the split sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } for (int j = split->i_start; j < split->i_end; j++) { assert(graph_copy->size > graph_copy->n_nodes); sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; } } if (sched->n_copies > 1) { // add input copies as leafs so that they are allocated first for (int i = 0; i < sched->n_graph_inputs; i++) { struct ggml_tensor * input = sched->graph_inputs[i]; size_t id = hash_id(input); int backend_id = tensor_backend_id(input); for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; assert(graph_copy->size > graph_copy->n_leafs); graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; } } for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; int backend_id = split->backend_id; for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; size_t id = hash_id(input); for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; assert(graph_copy->size > graph_copy->n_leafs); graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; } } } } // add leafs from the original graph for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf); assert(graph_copy->size > graph_copy->n_leafs); graph_copy->leafs[graph_copy->n_leafs++] = leaf; } } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { bool backend_ids_changed = false; for (int i = 0; i < sched->graph.n_nodes; i++) { if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] && sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) { backend_ids_changed = true; break; } } if (!backend_ids_changed) { for (int i = 0; i < sched->graph.n_leafs; i++) { if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] && sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) { backend_ids_changed = true; break; } } } // allocate graph if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { fprintf(stderr, "%s: failed to allocate graph\n", __func__); return false; } } return true; } static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_backend_sched_split * splits = sched->splits; for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &splits[i]; int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; // copy the input tensors to the split backend for (int j = 0; j < split->n_inputs; j++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); if (input->flags & GGML_TENSOR_FLAG_INPUT) { // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); } ggml_backend_tensor_copy(input, input_cpy); } else { // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); } // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { ggml_backend_synchronize(input_backend); if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); } ggml_backend_tensor_copy(input, input_cpy); } } } if (!sched->callback_eval) { enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { return ec; } } else { // similar to ggml_backend_compare_graph_backend for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { struct ggml_tensor * t = split->graph.nodes[j0]; // check if the user needs data from this node bool need = sched->callback_eval(t, true, sched->callback_eval_user_data); int j1 = j0; // determine the range [j0, j1] of nodes that can be computed together while (!need && j1 < split->graph.n_nodes - 1) { t = split->graph.nodes[++j1]; need = sched->callback_eval(t, true, sched->callback_eval_user_data); } struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); if (ec != GGML_STATUS_SUCCESS) { return ec; } // TODO: pass backend to the callback, then the user can decide if they want to synchronize ggml_backend_synchronize(split_backend); if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { break; } j0 = j1; } } // record the event of this copy if (split->n_inputs > 0) { if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend); } } } sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies; return GGML_STATUS_SUCCESS; } ggml_backend_sched_t ggml_backend_sched_new( ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel) { GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; sched->n_backends = n_backends; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; // initialize hash table // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead) sched->hash_set = ggml_hash_set_new(graph_size); sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0])); sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); sched->context_buffer = (char *) malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); if (sched->n_copies > 1) { for (int c = 0; c < sched->n_copies; c++) { sched->events[b][c] = ggml_backend_event_new(backends[b]->device); } } } sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); ggml_backend_sched_reset(sched); return sched; } void ggml_backend_sched_free(ggml_backend_sched_t sched) { if (sched == NULL) { return; } for (int b = 0; b < sched->n_backends; b++) { for (int c = 0; c < sched->n_copies; c++) { ggml_backend_event_free(sched->events[b][c]); } } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); ggml_hash_set_free(&sched->hash_set); free(sched->splits); free(sched->hv_tensor_backend_ids); free(sched->hv_tensor_copies); free(sched->node_backend_ids); free(sched->leaf_backend_ids); free(sched->prev_node_backend_ids); free(sched->prev_leaf_backend_ids); free(sched->context_buffer); free(sched->graph.nodes); free(sched->graph.leafs); free(sched); } void ggml_backend_sched_reset(ggml_backend_sched_t sched) { // reset state for the next run if (!sched->is_reset) { ggml_hash_set_reset(&sched->hash_set); memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); sched->is_reset = true; } sched->is_alloc = false; } bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); ggml_backend_sched_split_graph(sched, measure_graph); if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { return false; } ggml_backend_sched_reset(sched); ggml_backend_sched_synchronize(sched); return true; } bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs); ggml_backend_sched_split_graph(sched, graph); if (!ggml_backend_sched_alloc_splits(sched)) { return false; } sched->is_alloc = true; return true; } enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph); ggml_backend_sched_synchronize(sched); return err; } enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { if (!sched->is_reset && !sched->is_alloc) { ggml_backend_sched_reset(sched); } if (!sched->is_alloc) { if (!ggml_backend_sched_alloc_graph(sched, graph)) { return GGML_STATUS_ALLOC_FAILED; } } return ggml_backend_sched_compute_splits(sched); } void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_backends; i++) { ggml_backend_synchronize(sched->backends[i]); } } void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { sched->callback_eval = callback; sched->callback_eval_user_data = user_data; } int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { return sched->n_splits; } int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) { return sched->n_copies; } int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) { return sched->n_backends; } ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) { GGML_ASSERT(i >= 0 && i < sched->n_backends); return sched->backends[i]; } size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; SET_CAUSE(node, "usr"); sched->is_reset = false; } ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { int backend_index = tensor_backend_id(node); if (backend_index == -1) { return NULL; } return sched->backends[backend_index]; } // utils void ggml_backend_view_init(struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); tensor->buffer = tensor->view_src->buffer; tensor->data = (char *)tensor->view_src->data + tensor->view_offs; ggml_backend_buffer_init_tensor(tensor->buffer, tensor); } void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->data == NULL); GGML_ASSERT(tensor->view_src == NULL); GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer)); GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <= (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer)); tensor->buffer = buffer; tensor->data = addr; ggml_backend_buffer_init_tensor(buffer, tensor); } static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) { GGML_ASSERT(src != NULL); GGML_ASSERT(src->data && "graph must be allocated"); size_t id = ggml_hash_insert(&hash_set, src); if (id == GGML_HASHSET_ALREADY_EXISTS) { return node_copies[ggml_hash_find(&hash_set, src)]; } struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); if (src->view_src != NULL) { dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); dst->view_offs = src->view_offs; } dst->op = src->op; memcpy(dst->op_params, src->op_params, sizeof(dst->op_params)); ggml_set_name(dst, src->name); // copy src for (int i = 0; i < GGML_MAX_SRC; i++) { struct ggml_tensor * s = src->src[i]; if (s == NULL) { continue; } dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); } node_copies[id] = dst; return dst; } static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { size_t id = ggml_hash_find(hash_set, src); if (node_init[id]) { return; } node_init[id] = true; struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); ggml_backend_view_init(dst); } else { ggml_backend_tensor_copy(src, dst); } // init src for (int i = 0; i < GGML_MAX_SRC; i++) { struct ggml_tensor * s = src->src[i]; if (s == NULL) { continue; } graph_copy_init_tensor(hash_set, node_copies, node_init, s); } } struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); struct ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), /* .mem_buffer = */ NULL, /* .no_alloc = */ true }; struct ggml_context * ctx_allocated = ggml_init(params); struct ggml_context * ctx_unallocated = ggml_init(params); if (ctx_allocated == NULL || ctx_unallocated == NULL) { fprintf(stderr, "failed to allocate context for graph copy\n"); ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); ggml_free(ctx_unallocated); return { /* .buffer = */ NULL, /* .ctx_allocated = */ NULL, /* .ctx_unallocated = */ NULL, /* .graph = */ NULL, }; } // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); } // allocate nodes ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); if (buffer == NULL) { fprintf(stderr, "failed to allocate buffer for graph copy\n"); ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); ggml_free(ctx_unallocated); return { /* .buffer = */ NULL, /* .ctx_allocated = */ NULL, /* .ctx_unallocated = */ NULL, /* .graph = */ NULL, }; } //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024); // copy data and init views for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; graph_copy_init_tensor(&hash_set, node_copies, node_init, node); } // build graph copy struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false); for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)]; graph_copy->nodes[i] = node_copy; } graph_copy->n_nodes = graph->n_nodes; ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); return { /* .buffer = */ buffer, /* .ctx_allocated = */ ctx_allocated, /* .ctx_unallocated = */ ctx_unallocated, /* .graph = */ graph_copy, }; } void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { ggml_backend_buffer_free(copy.buffer); ggml_free(copy.ctx_allocated); ggml_free(copy.ctx_unallocated); } bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); if (copy.buffer == NULL) { return false; } struct ggml_cgraph * g1 = graph; struct ggml_cgraph * g2 = copy.graph; assert(g1->n_nodes == g2->n_nodes); for (int i = 0; i < g1->n_nodes; i++) { //printf("eval %d/%d\n", i, g1->n_nodes); struct ggml_tensor * t1 = g1->nodes[i]; struct ggml_tensor * t2 = g2->nodes[i]; assert(t1->op == t2->op && ggml_are_same_layout(t1, t2)); struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); ggml_backend_graph_compute(backend1, &g1v); ggml_backend_graph_compute(backend2, &g2v); if (ggml_is_view_op(t1->op)) { continue; } // compare results, calculate rms etc if (!callback(i, t1, t2, user_data)) { break; } } ggml_backend_graph_copy_free(copy); return true; }