mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-24 10:24:35 +00:00
metal : use shared buffers between CPU and GPU (#1696)
* Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU * Page-align buffers used by Metal * Remove trailing whitespace * Only import unistd.h for Metal builds * metal : remove unnecessary copies --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
efe0507632
commit
9d0693bce3
17
ggml-metal.m
17
ggml-metal.m
@@ -195,14 +195,25 @@ bool ggml_metal_add_buffer(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t page_size = getpagesize();
|
||||||
|
size_t aligned_size = size;
|
||||||
|
if ((aligned_size % page_size) != 0) {
|
||||||
|
aligned_size += (page_size - (aligned_size % page_size));
|
||||||
|
}
|
||||||
|
|
||||||
ctx->buffers[ctx->n_buffers].name = name;
|
ctx->buffers[ctx->n_buffers].name = name;
|
||||||
ctx->buffers[ctx->n_buffers].data = data;
|
ctx->buffers[ctx->n_buffers].data = data;
|
||||||
ctx->buffers[ctx->n_buffers].size = size;
|
ctx->buffers[ctx->n_buffers].size = size;
|
||||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
|
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
|
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
|
||||||
++ctx->n_buffers;
|
++ctx->n_buffers;
|
||||||
|
|
||||||
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
8
ggml.c
8
ggml.c
@@ -22,6 +22,10 @@
|
|||||||
#include <float.h>
|
#include <float.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
// if C99 - static_assert is noop
|
// if C99 - static_assert is noop
|
||||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
// ref: https://stackoverflow.com/a/53923785/4039976
|
||||||
#ifndef static_assert
|
#ifndef static_assert
|
||||||
@@ -122,7 +126,11 @@ typedef void* thread_ret_t;
|
|||||||
#else
|
#else
|
||||||
inline static void* ggml_aligned_malloc(size_t size) {
|
inline static void* ggml_aligned_malloc(size_t size) {
|
||||||
void* aligned_memory = NULL;
|
void* aligned_memory = NULL;
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
int result = posix_memalign(&aligned_memory, getpagesize(), size);
|
||||||
|
#else
|
||||||
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
||||||
|
#endif
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
// Handle allocation failure
|
// Handle allocation failure
|
||||||
return NULL;
|
return NULL;
|
||||||
|
16
llama-util.h
16
llama-util.h
@@ -405,13 +405,29 @@ struct llama_buffer {
|
|||||||
llama_buffer() = default;
|
llama_buffer() = default;
|
||||||
|
|
||||||
void resize(size_t len) {
|
void resize(size_t len) {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
free(addr);
|
||||||
|
int result = posix_memalign((void **) &addr, getpagesize(), len);
|
||||||
|
if (result == 0) {
|
||||||
|
memset(addr, 0, len);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
addr = NULL;
|
||||||
|
}
|
||||||
|
#else
|
||||||
delete[] addr;
|
delete[] addr;
|
||||||
addr = new uint8_t[len];
|
addr = new uint8_t[len];
|
||||||
|
#endif
|
||||||
size = len;
|
size = len;
|
||||||
}
|
}
|
||||||
|
|
||||||
~llama_buffer() {
|
~llama_buffer() {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
free(addr);
|
||||||
|
#else
|
||||||
delete[] addr;
|
delete[] addr;
|
||||||
|
#endif
|
||||||
|
addr = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// disable copy and move
|
// disable copy and move
|
||||||
|
13
llama.cpp
13
llama.cpp
@@ -53,7 +53,6 @@ enum e_model {
|
|||||||
MODEL_65B,
|
MODEL_65B,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static const size_t MB = 1024*1024;
|
static const size_t MB = 1024*1024;
|
||||||
|
|
||||||
// computed for n_ctx == 2048
|
// computed for n_ctx == 2048
|
||||||
@@ -1281,12 +1280,6 @@ static bool llama_eval_internal(
|
|||||||
ggml_set_name(embd, "embd");
|
ggml_set_name(embd, "embd");
|
||||||
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
if (lctx.ctx_metal && N == 1) {
|
|
||||||
ggml_metal_set_tensor(lctx.ctx_metal, embd);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
||||||
|
|
||||||
@@ -1484,12 +1477,6 @@ static bool llama_eval_internal(
|
|||||||
}
|
}
|
||||||
|
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute(ctx0, &gf);
|
||||||
|
|
||||||
if (lctx.ctx_metal) {
|
|
||||||
// We need to sync the CPU KV cache with the GPU KV cache
|
|
||||||
ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
|
|
||||||
ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute(ctx0, &gf);
|
||||||
|
Loading…
Reference in New Issue
Block a user