diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index cc0adaf2f..01917ab01 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1,5 +1,7 @@
-#include "ggml-kompute.h"
 #include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-kompute.h"
 
 // These are generated at build time by cmake custom command
 #include "shaderop_scale.h"
@@ -488,16 +490,28 @@ void ggml_vk_free_memory(ggml_vk_memory &memory)
 }
 
 static
-decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
+ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
+    // compatibility with ggml-backend
+    if (t->buffer && t->buffer->buft == ggml_backend_kompute_buffer_type()) {
+        ggml_vk_memory * buf_ctx = (ggml_vk_memory *) t->buffer->context;
+
+        const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
+
+        GGML_ASSERT(ioffs >= 0 && ioffs + ggml_nbytes(t) <= (int64_t)t->buffer->size);
+
+        offset = (uint64_t)ioffs;
+        return buf_ctx;
+    }
+
     for (auto it = ctx->buffers.begin(); ; it++) {
         if (it == ctx->buffers.end()) {
             fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data);
-            return it;
+            return nullptr;
         }
         if (it->data <= t->data &&
                 reinterpret_cast<const char *>(it->data) + it->size >= (reinterpret_cast<const char *>(t->data) + ggml_nbytes(t))) {
             offset = reinterpret_cast<const char *>(t->data) - reinterpret_cast<const char *>(it->data);
-            return it;
+            return &*it;
         }
     }
 }
@@ -505,8 +519,8 @@ decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggm
 static
 const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) {
     uint64_t originalOffset = 0;
-    auto res = ggml_vk_find_tensor(ctx, t, originalOffset);
-    if (res == ctx->buffers.end()) {
+    auto * res = ggml_vk_find_tensor(ctx, t, originalOffset);
+    if (!res) {
         static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
         return nullTensor;
     }
@@ -1629,3 +1643,158 @@ kp::TensorT<uint8_t>::dataType()
 {
     return TensorDataTypes::eUnsignedInt;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
+    GGML_UNUSED(buffer);
+    return "Kompute";
+}
+
+static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    auto * memory = (ggml_vk_memory *)buffer->context;
+    if (ggml_vk_has_device()) {
+        ggml_vk_free_memory(*memory);
+    }
+    delete memory;
+}
+
+static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return ((ggml_vk_memory *)buffer->context)->data;
+}
+
+static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+    ggml_vk_h2d_buffer(*(ggml_vk_memory *)buffer->context);
+}
+
+static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_vk_d2h_buffer(*(ggml_vk_memory *)buffer->context);
+    memcpy(data, (const char *)tensor->data + offset, size);
+}
+
+static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    auto * memory = (ggml_vk_memory *)buffer->context;
+    memset(memory->data, value, buffer->size);
+    ggml_vk_h2d_buffer(*memory);
+}
+
+static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
+    /* .get_name        = */ ggml_backend_kompute_buffer_get_name,
+    /* .free_buffer     = */ ggml_backend_kompute_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_kompute_buffer_get_base,
+    /* .init_tensor     = */ NULL,
+    /* .set_tensor      = */ ggml_backend_kompute_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_kompute_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to   = */ NULL,
+    /* .clear           = */ ggml_backend_kompute_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// default buffer type
+
+static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return "Kompute";
+}
+
+static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
+    return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
+}
+
+static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return 32;
+}
+
+static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    GGML_UNUSED(buft);
+    return ggml_backend_is_kompute(backend);
+}
+
+ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = {
+        /* .iface   = */ {
+            /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
+            /* .is_host          = */ NULL,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_buffer_type_kompute;
+}
+
+// backend
+
+static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+    return "Kompute";
+}
+
+static void ggml_backend_kompute_free(ggml_backend_t backend) {
+    struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context;
+    ggml_vk_free_device();
+    ggml_vk_free(ctx);
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+    return ggml_backend_kompute_buffer_type();
+}
+
+static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    auto * ctx = (ggml_kompute_context *)backend->context;
+    ggml_vk_graph_compute(ctx, cgraph);
+    return true;
+}
+
+static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    GGML_UNUSED(backend);
+    GGML_UNUSED(op);
+    return true; // TODO: implement
+}
+
+static struct ggml_backend_i kompute_backend_i = {
+    /* .get_name                = */ ggml_backend_kompute_name,
+    /* .free                    = */ ggml_backend_kompute_free,
+    /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
+    /* .supports_op             = */ ggml_backend_kompute_supports_op,
+};
+
+ggml_backend_t ggml_backend_kompute_init() {
+    if (!ggml_vk_has_device()) {
+        fprintf(stderr, "%s: error: device was not initialized\n", __func__);
+        return nullptr;
+    }
+
+    struct ggml_kompute_context * ctx = ggml_vk_init();
+
+    ggml_backend_t kompute_backend = new ggml_backend {
+        /* .interface = */ kompute_backend_i,
+        /* .context   = */ ctx,
+    };
+
+    return kompute_backend;
+}
+
+bool ggml_backend_is_kompute(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_kompute_name;
+}
diff --git a/ggml-kompute.h b/ggml-kompute.h
index ac8a4d4a0..f895dc545 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "ggml-backend.h"
+
 #include <cstddef>
 #include <vector>
 #include <string>
@@ -55,3 +57,17 @@ void ggml_vk_d2h_all(struct ggml_kompute_context * ctx);
 void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
+
+//
+// backend API
+// user-code should use only these functions
+//
+
+// forward declaration
+typedef struct ggml_backend * ggml_backend_t;
+
+GGML_API ggml_backend_t ggml_backend_kompute_init(void);
+
+GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
diff --git a/llama.cpp b/llama.cpp
index 3f2ae956f..0588250f2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -760,63 +760,6 @@ static std::string llama_format_win_err(DWORD err) {
 }
 #endif
 
-// TODO(jared): remove this
-struct llama_buffer {
-    void * data = NULL;
-    size_t size = 0;
-#ifdef GGML_USE_KOMPUTE
-    ggml_vk_memory memory;
-#endif
-
-    // fallback to malloc / free
-    // useful in cases where CUDA can try to allocate PINNED memory
-    bool fallback = false;
-
-    void resize(size_t n) {
-        llama_host_free(data);
-
-#ifdef GGML_USE_KOMPUTE
-        if (ggml_vk_has_device()) {
-            this->memory = ggml_vk_allocate(n);
-            this->data = (uint8_t*)memory.data;
-            this->size = n;
-            return;
-        }
-#endif
-        data = llama_host_malloc(n);
-        if (!data) {
-            fallback = true;
-            data = malloc(n);
-        } else {
-            fallback = false;
-        }
-
-        GGML_ASSERT(data);
-        size = n;
-    }
-
-    ~llama_buffer() {
-        if (data) {
-#ifdef GGML_USE_KOMPUTE
-            if (memory.data) {
-                if (ggml_vk_has_device()) {
-                    ggml_vk_free_memory(memory);
-                }
-                data = NULL;
-                return;
-            }
-#endif
-            if (fallback) { // NOLINT
-                free(data);
-            } else {
-                llama_host_free(data);
-            }
-        }
-
-        data = NULL;
-    }
-};
-
 template <typename T>
 struct no_init {
     T value;
@@ -1288,6 +1231,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type();
 #endif
 
     if (buft == nullptr) {
@@ -1721,11 +1666,6 @@ struct llama_context {
     // allocator for the input tensors
     ggml_tallocr * alloc = nullptr;
 
-// TODO(jared): remove this
-#if defined(GGML_USE_KOMPUTE)
-    ggml_kompute_context * ctx_kompute = NULL;
-#endif
-
     // temporary buffer for copying data to/from the backend
     std::vector<no_init<uint8_t>> buf_copy;
 
@@ -4362,10 +4302,6 @@ struct llm_build_context {
 
     std::vector<uint8_t> & buf_compute_meta;
 
-#ifdef GGML_USE_KOMPUTE
-    ggml_kompute_context * ctx_kompute;
-#endif
-
     struct ggml_context * ctx0 = nullptr;
 
     // TODO: consider making the entire interface noexcept
@@ -4405,10 +4341,6 @@ struct llm_build_context {
         do_rope_shift    (worst_case || kv_self.has_shift),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta)
-// TODO(jared): remove this
-#ifdef GGML_USE_KOMPUTE
-        , ctx_kompute    (lctx.ctx_kompute)
-#endif
     {
         // all initializations should be done in init()
     }
@@ -6028,11 +5960,6 @@ static struct ggml_cgraph * llama_build_graph(
     bool alloc_inp_KQ_mask = false;
     bool alloc_inp_K_shift = false;
 
-    // TODO(jared): do we still need this?
-#ifdef GGML_USE_KOMPUTE
-    const bool needs_h2d_all = lctx.ctx_kompute && !ggml_vk_has_h2d_all(lctx.ctx_kompute);
-#endif
-
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
@@ -6149,22 +6076,6 @@ static struct ggml_cgraph * llama_build_graph(
 
             alloc_inp_K_shift = true;
         }
-
-        // TODO(jared): this shouldn't be needed anymore
-#ifdef GGML_USE_KOMPUTE
-        if (lctx.ctx_kompute && !needs_h2d_all) {
-            const char * offload_tensors[] = {"inp_tokens", "inp_pos", "KQ_mask", "K_shift"};
-            for (auto off : offload_tensors) {
-                if (strcmp(name, off) == 0) {
-                    ggml_vk_h2d_tensor(lctx.ctx_kompute, cur);
-                    break;
-                }
-            }
-            if (strcmp(name, "inp_embd") == 0 && !batch.token) {
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, cur);
-            }
-        }
-#endif
     };
 
     struct ggml_cgraph * result = NULL;
@@ -6230,12 +6141,6 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
-#ifdef GGML_USE_KOMPUTE
-    if (needs_h2d_all) {
-        ggml_vk_h2d_all(lctx.ctx_kompute);
-    }
-#endif
-
     llm.free();
 
     return result;
@@ -6374,25 +6279,6 @@ static int llama_decode_internal(
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
     }
-#elif defined(GGML_USE_KOMPUTE)
-    if (lctx.ctx_kompute && n_tokens == 1) {
-        ggml_vk_graph_compute(lctx.ctx_kompute, gf);
-        ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
-    } else {
-        if (lctx.ctx_kompute) {
-            for (int il = 0; il < hparams.n_layer; ++il) {
-                ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k_l[il]);
-                ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v_l[il]);
-            }
-        }
-        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
-        if (lctx.ctx_kompute) {
-            for (int il = 0; il < hparams.n_layer; ++il) {
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k_l[il]);
-                ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v_l[il]);
-            }
-        }
-    }
 #endif
 
     if (lctx.backend_cpu != nullptr) {
@@ -9446,6 +9332,16 @@ struct llama_context * llama_new_context_with_model(
                 }
             }
         }
+#elif defined(GGML_USE_KOMPUTE)
+        if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
+            auto * backend = ggml_backend_kompute_init();
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
@@ -9518,23 +9414,6 @@ struct llama_context * llama_new_context_with_model(
                         ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
             }
         }
-
-        // TODO(jared): remove this
-#if defined(GGML_USE_KOMPUTE)
-        if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
-            // this allocates all Vulkan resources and memory buffers
-            ctx->ctx_kompute = ggml_vk_init();
-
-            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
-
-            printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
-
-            ggml_vk_add_buffer(ctx->ctx_kompute, "data",  ctx->model.buf.memory);
-            ggml_vk_add_buffer(ctx->ctx_kompute, "eval",  ctx->buf_compute.memory);
-            ggml_vk_add_buffer(ctx->ctx_kompute, "kv",    ctx->kv_self.buf.memory);
-            ggml_vk_add_buffer(ctx->ctx_kompute, "alloc", ctx->buf_alloc.memory);
-        }
-#endif
     }
 
 #ifdef GGML_USE_MPI
@@ -9555,9 +9434,6 @@ struct llama_context * llama_new_context_with_model(
 }
 
 void llama_free(struct llama_context * ctx) {
-#ifdef GGML_USE_KOMPUTE
-    ggml_vk_free(ctx->ctx_kompute);
-#endif
     delete ctx;
 #ifdef GGML_USE_KOMPUTE
     ggml_vk_free_device();
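
For reviewers, a minimal usage sketch (not part of the patch) of the public API that ggml-kompute.h now exposes, combined with the generic ggml-backend helpers available at the time of this change. It assumes a Vulkan device has already been selected through the existing ggml_vk device helpers declared in ggml-kompute.h; the buffer size is an arbitrary placeholder.

// illustrative only -- assumes a Vulkan device was already initialized elsewhere
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-kompute.h"

int main() {
    if (!ggml_vk_has_device()) {
        fprintf(stderr, "no Kompute-capable device initialized\n");
        return 1;
    }

    // create the backend; returns nullptr on failure, as llama_new_context_with_model checks
    ggml_backend_t backend = ggml_backend_kompute_init();
    if (backend == nullptr) {
        return 1;
    }

    // the backend identifies itself through the regular ggml-backend interface
    printf("backend: %s, is_kompute: %d\n",
           ggml_backend_name(backend), ggml_backend_is_kompute(backend));

    // weights and KV buffers for offloaded layers are allocated from this buffer type,
    // which is what llama_default_buffer_type_offload() now returns under GGML_USE_KOMPUTE
    ggml_backend_buffer_type_t buft = ggml_backend_kompute_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16 * 1024 * 1024); // placeholder size
    printf("allocated %zu bytes in a Kompute buffer\n", ggml_backend_buffer_get_size(buf));

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend); // ggml_backend_kompute_free also releases the Vulkan device
    return 0;
}

llama.cpp itself follows the same pattern above: backend creation in llama_new_context_with_model and buffer-type selection in llama_default_buffer_type_offload.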