mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-24 10:24:35 +00:00
metal : use shared buffers between CPU and GPU (#1696)
* Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU * Page-align buffers used by Metal * Remove trailing whitespace * Only import unistd.h for Metal builds * metal : remove unnecessary copies --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
efe0507632
commit
9d0693bce3
17
ggml-metal.m
17
ggml-metal.m
@@ -195,14 +195,25 @@ bool ggml_metal_add_buffer(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t page_size = getpagesize();
|
||||||
|
size_t aligned_size = size;
|
||||||
|
if ((aligned_size % page_size) != 0) {
|
||||||
|
aligned_size += (page_size - (aligned_size % page_size));
|
||||||
|
}
|
||||||
|
|
||||||
ctx->buffers[ctx->n_buffers].name = name;
|
ctx->buffers[ctx->n_buffers].name = name;
|
||||||
ctx->buffers[ctx->n_buffers].data = data;
|
ctx->buffers[ctx->n_buffers].data = data;
|
||||||
ctx->buffers[ctx->n_buffers].size = size;
|
ctx->buffers[ctx->n_buffers].size = size;
|
||||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
|
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
|
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
|
||||||
++ctx->n_buffers;
|
++ctx->n_buffers;
|
||||||
|
|
||||||
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
8
ggml.c
8
ggml.c
@@ -22,6 +22,10 @@
|
|||||||
#include <float.h>
|
#include <float.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
// if C99 - static_assert is noop
|
// if C99 - static_assert is noop
|
||||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
// ref: https://stackoverflow.com/a/53923785/4039976
|
||||||
#ifndef static_assert
|
#ifndef static_assert
|
||||||
@@ -122,7 +126,11 @@ typedef void* thread_ret_t;
|
|||||||
#else
|
#else
|
||||||
inline static void* ggml_aligned_malloc(size_t size) {
|
inline static void* ggml_aligned_malloc(size_t size) {
|
||||||
void* aligned_memory = NULL;
|
void* aligned_memory = NULL;
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
int result = posix_memalign(&aligned_memory, getpagesize(), size);
|
||||||
|
#else
|
||||||
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
||||||
|
#endif
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
// Handle allocation failure
|
// Handle allocation failure
|
||||||
return NULL;
|
return NULL;
|
||||||
|
16
llama-util.h
16
llama-util.h
@@ -405,13 +405,29 @@ struct llama_buffer {
|
|||||||
llama_buffer() = default;
|
llama_buffer() = default;
|
||||||
|
|
||||||
void resize(size_t len) {
|
void resize(size_t len) {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
free(addr);
|
||||||
|
int result = posix_memalign((void **) &addr, getpagesize(), len);
|
||||||
|
if (result == 0) {
|
||||||
|
memset(addr, 0, len);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
addr = NULL;
|
||||||
|
}
|
||||||
|
#else
|
||||||
delete[] addr;
|
delete[] addr;
|
||||||
addr = new uint8_t[len];
|
addr = new uint8_t[len];
|
||||||
|
#endif
|
||||||
size = len;
|
size = len;
|
||||||
}
|
}
|
||||||
|
|
||||||
~llama_buffer() {
|
~llama_buffer() {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
free(addr);
|
||||||
|
#else
|
||||||
delete[] addr;
|
delete[] addr;
|
||||||
|
#endif
|
||||||
|
addr = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// disable copy and move
|
// disable copy and move
|
||||||
|
13
llama.cpp
13
llama.cpp
@@ -53,7 +53,6 @@ enum e_model {
|
|||||||
MODEL_65B,
|
MODEL_65B,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static const size_t MB = 1024*1024;
|
static const size_t MB = 1024*1024;
|
||||||
|
|
||||||
// computed for n_ctx == 2048
|
// computed for n_ctx == 2048
|
||||||
@@ -1281,12 +1280,6 @@ static bool llama_eval_internal(
|
|||||||
ggml_set_name(embd, "embd");
|
ggml_set_name(embd, "embd");
|
||||||
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
if (lctx.ctx_metal && N == 1) {
|
|
||||||
ggml_metal_set_tensor(lctx.ctx_metal, embd);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
||||||
|
|
||||||
@@ -1484,12 +1477,6 @@ static bool llama_eval_internal(
|
|||||||
}
|
}
|
||||||
|
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute(ctx0, &gf);
|
||||||
|
|
||||||
if (lctx.ctx_metal) {
|
|
||||||
// We need to sync the CPU KV cache with the GPU KV cache
|
|
||||||
ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
|
|
||||||
ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute(ctx0, &gf);
|
||||||
|
Loading…
Reference in New Issue
Block a user