mirror of https://github.com/ggerganov/llama.cpp.git
Support for gguf.
commit 1b1416d7b7 (parent 2c24d67e7b)

llama.cpp: 44 additions, 0 deletions
@@ -514,6 +514,9 @@ static std::string llama_format_win_err(DWORD err) {
 struct llama_buffer {
     void * data = NULL;
     size_t size = 0;
+#if defined(GGML_USE_KOMPUTE)
+    ggml_vk_memory memory;
+#endif
 
     // fallback to malloc / free
     // useful in cases where CUDA can try to allocate PINNED memory
@@ -522,6 +525,14 @@ struct llama_buffer {
     void resize(size_t n) {
         llama_host_free(data);
 
+#if defined(GGML_USE_KOMPUTE)
+        if (ggml_vk_has_device()) {
+            this->memory = ggml_vk_allocate(n);
+            this->data = (uint8_t*)memory.data;
+            this->size = n;
+            return;
+        }
+#endif
         data = llama_host_malloc(n);
         if (!data) {
             fallback = true;
@@ -536,6 +547,13 @@ struct llama_buffer {
 
     ~llama_buffer() {
         if (data) {
+#if defined(GGML_USE_KOMPUTE)
+            if (ggml_vk_has_device()) {
+                ggml_vk_free_memory(memory);
+                data = NULL;
+                return;
+            }
+#endif
             if (fallback) { // NOLINT
                 free(data);
             } else {
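
Note: the two llama_buffer hunks above implement a device-first, host-fallback allocation scheme: resize() tries the Vulkan allocator when a device is present and only falls back to llama_host_malloc() otherwise, and the destructor mirrors that choice. Below is a minimal self-contained sketch of the same control flow; the vk_* names are hypothetical stand-ins for the ggml_vk_* API (backed here by plain malloc so the sketch compiles on its own) and are not the actual llama.cpp code.

    // Minimal sketch of the device-first / host-fallback pattern used by the
    // llama_buffer changes above. The vk_* names are hypothetical stand-ins for
    // the ggml_vk_* API, backed by plain malloc so the sketch is self-contained;
    // only the control flow mirrors the diff.
    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    struct vk_memory { void * data = nullptr; };                                              // stand-in for ggml_vk_memory
    static bool      vk_has_device()               { return false; }                          // stand-in for ggml_vk_has_device()
    static vk_memory vk_allocate(size_t n)         { return { std::malloc(n) }; }             // stand-in for ggml_vk_allocate()
    static void      vk_free_memory(vk_memory & m) { std::free(m.data); m.data = nullptr; }   // stand-in for ggml_vk_free_memory()

    struct buffer {
        void *    data = nullptr;
        size_t    size = 0;
        vk_memory memory;            // holds the device allocation, if one was made
        bool      on_device = false;

        void resize(size_t n) {
            // the real code frees any previous host allocation first (llama_host_free)
            if (vk_has_device()) {   // prefer device memory when a GPU is present
                memory    = vk_allocate(n);
                data      = memory.data;
                size      = n;
                on_device = true;
                return;
            }
            data = std::malloc(n);   // host fallback (llama_host_malloc in the real code)
            size = n;
        }

        ~buffer() {
            if (!data) { return; }
            if (on_device) { vk_free_memory(memory); }  // device path
            else           { std::free(data);        }  // host path
            data = nullptr;
        }
    };

    int main() {
        buffer buf;
        buf.resize(1024);
        std::printf("allocated %zu bytes on the %s\n", buf.size, buf.on_device ? "device" : "host");
        return 0;
    }
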
@@ -1398,6 +1416,9 @@ struct llama_model_loader {
             use_mmap = false;
         }
 
+#if defined(GGML_USE_KOMPUTE)
+        use_mmap = false;
+#endif
        this->use_mmap = use_mmap;
    }
 
@@ -6470,6 +6491,23 @@ struct llama_context * llama_new_context_with_model(
             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
         }
+#elif defined(GGML_USE_KOMPUTE)
+        if (ggml_vk_has_device() && params.n_gpu_layers > 0
+            && (model->ftype == LLAMA_FTYPE_ALL_F32
+            || model->ftype == LLAMA_FTYPE_MOSTLY_F16
+            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) {
+            // this allocates all Vulkan resources and memory buffers
+            ctx->ctx_kompute = ggml_vk_init();
+
+            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+            printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
+            ggml_vk_add_buffer(ctx->ctx_kompute, "data", ctx->model.buf.memory);
+            ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.memory);
+            ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->kv_self.buf.memory);
+            ggml_vk_add_buffer(ctx->ctx_kompute, "alloc", ctx->buf_alloc.memory);
+        }
 #endif
     }
 
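
The #elif block above engages Kompute only when a Vulkan device is present, GPU offload was requested, and the model is F32, F16 or Q4_0. Below is a hedged usage sketch of how a caller opts in, assuming the public llama.h API of this period (llama_context_default_params, llama_load_model_from_file, llama_new_context_with_model, llama_free_model, and the n_gpu_layers field on llama_context_params); these names are not part of the diff and may differ in other versions.

    // Usage sketch: requesting GPU offload so llama_new_context_with_model takes
    // the GGML_USE_KOMPUTE branch above. API names are assumed from llama.h of
    // this era and are not part of the diff itself.
    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            std::fprintf(stderr, "usage: %s <model file>\n", argv[0]);
            return 1;
        }

        struct llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 1;  // any value > 0 satisfies the n_gpu_layers check above

        // The Kompute branch additionally requires an F32, F16 or Q4_0 model;
        // other quantizations fall through to the non-Vulkan setup path.
        struct llama_model * model = llama_load_model_from_file(argv[1], params);
        if (model == NULL) {
            return 1;
        }
        struct llama_context * ctx = llama_new_context_with_model(model, params);
        if (ctx == NULL) {
            llama_free_model(model);
            return 1;
        }

        // ... evaluate tokens here ...

        llama_free(ctx);          // also tears down per-context Vulkan resources (see the llama_free hunk below)
        llama_free_model(model);
        return 0;
    }
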
@@ -6503,7 +6541,13 @@ static struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
+#ifdef GGML_USE_KOMPUTE
+    ggml_vk_free(ctx->ctx_kompute);
+#endif
     delete ctx;
+#ifdef GGML_USE_KOMPUTE
+    ggml_vk_free_device();
+#endif
 }
 
 int llama_n_vocab(const struct llama_context * ctx) {
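
Note on the llama_free hunk: the per-context Vulkan resources (ctx->ctx_kompute) are released before the context is deleted, while ggml_vk_free_device() runs only afterwards, presumably because the device is shared state that must outlive any per-context allocations still being torn down during deletion.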