metal : rewrite to fit new backend interface correctly (WIP)

This commit is contained in:
Georgi Gerganov 2023-07-20 16:36:33 +03:00
parent cb82adadb8
commit d45c1631bc
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
6 changed files with 208 additions and 273 deletions

View File

@ -94,7 +94,6 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size
*allocator = (struct ggml_backend_buffer){ *allocator = (struct ggml_backend_buffer){
/* .interface = */ ggml_allocator_simple_interface, /* .interface = */ ggml_allocator_simple_interface,
/* .context = */ ctx, /* .context = */ ctx,
/* .backend_size = */ 0,
/* .backend_data = */ NULL, /* .backend_data = */ NULL,
}; };
return allocator; return allocator;
@ -146,6 +145,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
return; return;
} }
//printf("src->data = %p, src->extra = %p\n", src->data, src->extra);
//printf("dst->data = %p, dst->extra = %p\n", dst->data, dst->extra);
if (dst->backend->interface.cpy_tensor_from != NULL) { if (dst->backend->interface.cpy_tensor_from != NULL) {
dst->backend->interface.cpy_tensor_from(dst->backend->context, src, dst); dst->backend->interface.cpy_tensor_from(dst->backend->context, src, dst);
} else if (src->backend->interface.cpy_tensor_to != NULL) { } else if (src->backend->interface.cpy_tensor_to != NULL) {
@ -193,7 +195,6 @@ static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_ba
struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT); struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
buffer->interface.free_data = ggml_backend_cpu_free_buffer; buffer->interface.free_data = ggml_backend_cpu_free_buffer;
buffer->backend_size = size;
buffer->backend_data = data; buffer->backend_data = data;
return buffer; return buffer;

View File

@ -27,7 +27,6 @@ extern "C" {
struct ggml_backend_buffer { struct ggml_backend_buffer {
struct ggml_backend_buffer_interface interface; struct ggml_backend_buffer_interface interface;
ggml_buffer_context_t context; ggml_buffer_context_t context;
size_t backend_size;
void * backend_data; void * backend_data;
}; };

View File

@ -19,14 +19,9 @@
#pragma once #pragma once
#include "ggml.h"
#include <stddef.h> #include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16
//struct ggml_tensor; //struct ggml_tensor;
//struct ggml_cgraph; //struct ggml_cgraph;
@ -34,16 +29,9 @@
extern "C" { extern "C" {
#endif #endif
struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu); struct ggml_backend;
// TODO: temporary - move to backend interface
bool ggml_backend_metal_map_buffer(
struct ggml_backend * backend,
const char * name,
void * data,
size_t size,
size_t max_size);
struct ggml_backend * ggml_backend_metal_init(void);
//struct ggml_metal_context; //struct ggml_metal_context;
// //

View File

@ -12,18 +12,16 @@
#else #else
#define metal_printf(...) fprintf(stderr, __VA_ARGS__) #define metal_printf(...) fprintf(stderr, __VA_ARGS__)
#endif #endif
//#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
struct ggml_metal_buffer { struct ggml_metal_buffer_wrapper {
const char * name; id<MTLBuffer> buffer;
void * data;
size_t size;
id<MTLBuffer> metal;
}; };
// Fake base address for tensors placed in Metal buffers.
// The simple allocator is initialized with this pointer as its "data" region, and the
// real location of a tensor is recovered as an offset into the Metal buffer:
//   offs = (size_t) tensor->data - (size_t) g_ptr_base
static void * g_ptr_base = (void *)0x1000;
struct ggml_metal_context { struct ggml_metal_context {
int n_cb; int n_cb;
@ -33,9 +31,6 @@ struct ggml_metal_context {
id<MTLCommandQueue> queue; id<MTLCommandQueue> queue;
id<MTLLibrary> library; id<MTLLibrary> library;
int n_buffers;
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
// custom kernels // custom kernels
#define GGML_METAL_DECL_KERNEL(name) \ #define GGML_METAL_DECL_KERNEL(name) \
id<MTLFunction> function_##name; \ id<MTLFunction> function_##name; \
@ -96,7 +91,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->n_cb = n_cb; ctx->n_cb = n_cb;
ctx->device = MTLCreateSystemDefaultDevice(); ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue]; ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;
// determine if we can use MPS // determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) { if (MPSSupportsMTLDevice(ctx->device)) {
@ -205,9 +199,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
void ggml_metal_free(struct ggml_metal_context * ctx) { void ggml_metal_free(struct ggml_metal_context * ctx) {
fprintf(stderr, "%s: deallocating\n", __func__); fprintf(stderr, "%s: deallocating\n", __func__);
for (int i = 0; i < ctx->n_buffers; ++i) {
[ctx->buffers[i].metal release];
}
free(ctx); free(ctx);
} }
@ -215,143 +206,29 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb; ctx->n_cb = n_cb;
} }
// finds the Metal buffer that contains the tensor data on the GPU device static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * tensor, size_t * offs) {
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the if (tensor == nil) {
// Metal buffer based on the host memory pointer
//
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
const int64_t tsize = ggml_nbytes(t);
// find the view that contains the tensor fully
for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
return ctx->buffers[i].metal;
}
}
fprintf(stderr, "%s: error: buffer is nil for tensor '%s'\n", __func__, t->name);
return nil; return nil;
}
// TODO: rename to ggml_metal_map_buffer
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size) {
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
fprintf(stderr, "%s: too many buffers\n", __func__);
return false;
} }
if (data) { switch (tensor->op) {
// verify that the buffer does not overlap with any of the existing buffers case GGML_OP_RESHAPE:
for (int i = 0; i < ctx->n_buffers; ++i) { case GGML_OP_VIEW:
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { {
fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); if (tensor->op == GGML_OP_VIEW) {
return false; //printf("view offs = %zu\n", *(size_t *)tensor->op_params);
} }
return ggml_metal_get_buffer(tensor->src[0], offs);
} }
const size_t size_page = getpagesize(); default: {}
size_t size_aligned = size;
if ((size_aligned % size_page) != 0) {
size_aligned += (size_page - (size_aligned % size_page));
} }
// the buffer fits into the max buffer size allowed by the device *offs = (size_t) tensor->data - (size_t) g_ptr_base;
if (size_aligned <= ctx->device.maxBufferLength) { //printf("%s: offs = %zu, %p, op = %s\n", __func__, *offs, tensor->extra, ggml_op_name(tensor->op));
ctx->buffers[ctx->n_buffers].name = name; return ((struct ggml_metal_buffer_wrapper *) tensor->extra)->buffer;
ctx->buffers[ctx->n_buffers].data = data;
ctx->buffers[ctx->n_buffers].size = size;
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
return false;
}
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
++ctx->n_buffers;
} else {
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
// one of the views
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
const size_t size_view = ctx->device.maxBufferLength;
for (size_t i = 0; i < size; i += size_step) {
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
return false;
}
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
if (i + size_step < size) {
fprintf(stderr, "\n");
}
++ctx->n_buffers;
}
}
fprintf(stderr, ", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
} else {
fprintf(stderr, "\n");
}
}
return true;
}
void ggml_metal_set_tensor(
struct ggml_metal_context * ctx,
struct ggml_tensor * t) {
metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
size_t offs;
id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
}
void ggml_metal_get_tensor(
struct ggml_metal_context * ctx,
struct ggml_tensor * t) {
metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
size_t offs;
id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
} }
void ggml_metal_graph_compute( void ggml_metal_graph_compute(
@ -432,23 +309,35 @@ void ggml_metal_graph_compute(
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; switch (dst->op) {
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; case GGML_OP_NONE:
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
{
continue;
} break;
default: break;
}
//metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); id<MTLBuffer> id_src0 = ggml_metal_get_buffer(src0, &offs_src0);
//if (src0) { id<MTLBuffer> id_src1 = ggml_metal_get_buffer(src1, &offs_src1);
// metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, id<MTLBuffer> id_dst = ggml_metal_get_buffer(dst, &offs_dst);
// ggml_is_contiguous(src0), src0->name);
//} metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
//if (src1) { if (src0) {
// metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
// ggml_is_contiguous(src1), src1->name); ggml_is_contiguous(src0), src0->name);
//} }
//if (dst) { if (src1) {
// metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
// dst->name); ggml_is_contiguous(src1), src1->name);
//} }
if (dst) {
metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
dst->name);
}
switch (dst->op) { switch (dst->op) {
case GGML_OP_NONE: case GGML_OP_NONE:
@ -501,7 +390,9 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
} }
const float scale = *(const float *) src1->data; //const float scale = *(const float *) src1->data;
const float scale = ((float *)((char *)[((struct ggml_metal_buffer_wrapper *)(src1->extra))->buffer contents] + (size_t) src1->data - (size_t)g_ptr_base))[0];
//printf("scale: %f, src1->data: %p, src1->extra: %p, src1->extra->buffer: %p\n", scale, src1->data, src1->extra, ((struct ggml_metal_buffer_wrapper *)(src1->extra))->buffer);
[encoder setComputePipelineState:ctx->pipeline_scale]; [encoder setComputePipelineState:ctx->pipeline_scale];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -578,7 +469,8 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
} }
const int n_past = ((int32_t *)(src1->data))[0]; //const int n_past = ((int32_t *)(src1->data))[0];
const int n_past = ((int32_t *)(dst->op_params))[0];
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -740,6 +632,10 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
//printf("id_src0 %p, offs_src0 %zu\n", id_src0, offs_src0);
//printf("id_src1 %p, offs_src1 %zu\n", id_src1, offs_src1);
//printf("id_dst %p, offs_dst %zu\n", id_dst, offs_dst);
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
@ -877,11 +773,10 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
} }
const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
const int n_past = ((int32_t *)(dst->op_params))[0];
float freq_base; float freq_base;
float freq_scale; float freq_scale;
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
@ -994,61 +889,140 @@ void ggml_metal_graph_compute(
} }
} }
bool ggml_backend_metal_map_buffer(
struct ggml_backend * backend,
const char * name,
void * data,
size_t size,
size_t max_size) {
return ggml_metal_add_buffer(backend->context, name, data, size, max_size);
}
static const char * ggml_backend_metal_name(struct ggml_backend * ctx) { static const char * ggml_backend_metal_name(struct ggml_backend * ctx) {
return "Metal"; return "Metal";
UNUSED(ctx); UNUSED(ctx);
} }
// Backend `free` callback: tears down the Metal context held in backend->context
// and then releases the backend object itself.
static void ggml_backend_metal_free(struct ggml_backend * backend) {
    // the context has to go first - it is only reachable through the backend
    ggml_metal_free((struct ggml_metal_context *) backend->context);
    free(backend);
}
// alignment passed to ggml_allocator_simple_init when creating Metal-backed buffers
static const size_t TENSOR_ALIGNMENT = 128;
// Buffer `init_tensor` callback: stores the buffer's backend data in tensor->extra,
// so the Metal buffer wrapper can later be recovered from the tensor alone.
static void ggml_backend_metal_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
    void * buffer_wrapper = alloc->backend_data;
    tensor->extra = buffer_wrapper;
}
// Buffer `free_data` callback: releases the MTLBuffer held by the wrapper stored in
// alloc->backend_data and frees the wrapper itself.
//
// Safe to call on a buffer whose backend data is already gone: the pointer is
// cleared after the release, so a repeated call is a no-op instead of a double free.
// NOTE: tensors initialized from this buffer still hold the old wrapper in ->extra
// and must not be used afterwards.
static void ggml_backend_metal_free_data(struct ggml_backend_buffer * alloc) {
    struct ggml_metal_buffer_wrapper * wrapper = (struct ggml_metal_buffer_wrapper *) alloc->backend_data;
    if (wrapper == NULL) {
        return;
    }

    [wrapper->buffer release];
    free(wrapper);

    // do not leave a dangling pointer behind
    alloc->backend_data = NULL;
}
// Backend `alloc_buffer` callback: creates a shared-storage MTLBuffer of `size` bytes
// and wraps it in a simple-allocator ggml_backend_buffer.
//
// The allocator does not hand out real addresses: it is initialized with g_ptr_base as
// its data region, so tensor->data values are offsets relative to g_ptr_base. The
// MTLBuffer wrapper is stored in buffer->backend_data and released by
// ggml_backend_metal_free_data.
//
// Any allocation failure is fatal (GGML_ASSERT).
static struct ggml_backend_buffer * ggml_backend_metal_alloc_buffer(struct ggml_backend * backend, size_t size) {
    struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *) backend->context;

    struct ggml_metal_buffer_wrapper * wrapper = malloc(sizeof(*wrapper));
    if (wrapper == NULL) {
        // previously the result was dereferenced without a check
        fprintf(stderr, "%s: failed to allocate buffer wrapper\n", __func__);
        GGML_ASSERT(false);
    }

    wrapper->buffer = [ctx_metal->device newBufferWithLength:size options:MTLResourceStorageModeShared];
    if (wrapper->buffer == nil) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        GGML_ASSERT(false);
    }

    struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(g_ptr_base, size, TENSOR_ALIGNMENT);
    buffer->interface.init_tensor = ggml_backend_metal_init_tensor;
    buffer->interface.free_data   = ggml_backend_metal_free_data;
    buffer->backend_data          = wrapper;

    return buffer;
}
// Backend `set_tensor_async` callback: copies `size` bytes from `data` into the tensor's
// storage, starting `offset` bytes into the tensor.
//
// The copy is a plain memcpy into the MTLBuffer contents, so it has completed by the
// time this function returns. tensor->data is an offset relative to g_ptr_base, not a
// real address.
//
// Asserts that [offset, offset + size) lies inside the tensor and that the tensor has
// been allocated in a Metal buffer (tensor->extra is set).
static void ggml_backend_metal_set_tensor_async(struct ggml_backend * backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    // written so that offset + size cannot wrap around and slip past the check
    const size_t nbytes = ggml_nbytes(tensor);
    GGML_ASSERT(offset <= nbytes && size <= nbytes - offset && "tensor write out of bounds");
    GGML_ASSERT(tensor->extra != nil && "tensor not allocated");

    struct ggml_metal_buffer_wrapper * wrapper = (struct ggml_metal_buffer_wrapper *) tensor->extra;
    char * contents = (char *) [wrapper->buffer contents];

    // position of the tensor inside the Metal buffer
    const size_t t_data = (size_t) tensor->data - (size_t) g_ptr_base;

    memcpy(contents + t_data + offset, data, size);

    UNUSED(backend);
}
// Backend `get_tensor_async` callback: copies `size` bytes of the tensor's storage,
// starting `offset` bytes into the tensor, out to `data`.
//
// The copy is a plain memcpy from the MTLBuffer contents, so it has completed by the
// time this function returns. tensor->data is an offset relative to g_ptr_base, not a
// real address.
//
// Asserts that [offset, offset + size) lies inside the tensor and that the tensor has
// been allocated in a Metal buffer (tensor->extra is set).
static void ggml_backend_metal_get_tensor_async(struct ggml_backend * backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    // written so that offset + size cannot wrap around and slip past the check
    const size_t nbytes = ggml_nbytes(tensor);
    GGML_ASSERT(offset <= nbytes && size <= nbytes - offset && "tensor read out of bounds");
    GGML_ASSERT(tensor->extra != nil && "tensor not allocated");

    struct ggml_metal_buffer_wrapper * wrapper = (struct ggml_metal_buffer_wrapper *) tensor->extra;
    const char * contents = (const char *) [wrapper->buffer contents];

    // position of the tensor inside the Metal buffer
    const size_t t_data = (size_t) tensor->data - (size_t) g_ptr_base;

    memcpy(data, contents + t_data + offset, size);

    UNUSED(backend);
}
// Backend `synchronize` callback: currently a no-op.
// The set/get tensor callbacks of this backend copy with memcpy before returning.
// NOTE(review): whether graph compute also finishes before returning is not visible here - confirm.
static void ggml_backend_metal_synchronize(struct ggml_backend * backend) {
UNUSED(backend);
}
// Backend `graph_plan_create` callback: graph plans are not implemented for Metal.
// Calling this trips GGML_ASSERT(false); the nil return only satisfies the signature.
static ggml_graph_plan_t ggml_backend_metal_graph_plan_create(struct ggml_backend * backend, struct ggml_cgraph * cgraph) {
    UNUSED(backend);
    UNUSED(cgraph);

    GGML_ASSERT(false);

    return nil;
}
// Backend `graph_plan_free` callback: graph plans are not implemented for Metal,
// so there is never a plan to free - calling this trips GGML_ASSERT(false).
static void ggml_backend_metal_graph_plan_free(struct ggml_backend * backend, ggml_graph_plan_t plan) {
    UNUSED(backend);
    UNUSED(plan);

    GGML_ASSERT(false);
}
// Backend `graph_plan_compute` callback: graph plans are not implemented for Metal;
// calling this trips GGML_ASSERT(false). Use the `graph_compute` callback instead.
static void ggml_backend_metal_graph_plan_compute(struct ggml_backend * backend, ggml_graph_plan_t plan) {
    UNUSED(backend);
    UNUSED(plan);

    GGML_ASSERT(false);
}
static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) {
ggml_metal_graph_compute(backend->context, cgraph); ggml_metal_graph_compute(backend->context, cgraph);
} }
static struct ggml_backend_interface metal_backend_interface = { static struct ggml_backend_interface metal_backend_interface = {
/* .get_name = */ ggml_backend_metal_name, /* .get_name = */ ggml_backend_metal_name,
/* .free = */ NULL, //ggml_backend_metal_alloc_buffer, /* .free = */ ggml_backend_metal_free,
/* .alloc_buffer = */ NULL, //ggml_backend_metal_free_buffer, /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer,
/* .set_tensor_async = */ NULL, //ggml_backend_metal_reset_buffer, /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
/* .get_tensor_async = */ NULL, //ggml_backend_metal_alloc_tensor, /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
/* .synchronize = */ NULL, //ggml_backend_metal_set_tensor_async, /* .synchronize = */ ggml_backend_metal_synchronize,
/* .cpy_tensor_from = */ NULL, //ggml_backend_metal_get_tensor_async, /* .cpy_tensor_from = */ nil, //ggml_backend_metal_get_tensor_async,
/* .cpy_tensor_to = */ NULL, //ggml_backend_metal_synchronize, /* .cpy_tensor_to = */ nil, //ggml_backend_metal_synchronize,
/* .graph_plan_create = */ NULL, //nullptr, /* .graph_plan_create = */ ggml_backend_metal_graph_plan_create,
/* .graph_plan_free = */ NULL, //nullptr, /* .graph_plan_free = */ ggml_backend_metal_graph_plan_free,
/* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_create, /* .graph_plan_compute = */ ggml_backend_metal_graph_plan_compute,
/* .graph_compute = */ ggml_backend_metal_graph_compute, /* .graph_compute = */ ggml_backend_metal_graph_compute,
}; };
struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu) { struct ggml_backend * ggml_backend_metal_init(void) {
struct ggml_metal_context * ctx = ggml_metal_init(8); struct ggml_metal_context * ctx = ggml_metal_init(1);
struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend)); struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend));
*backend_metal = (struct ggml_backend){ *backend_metal = (struct ggml_backend){
/* .interface = */ metal_backend_interface, /* .interface = */ metal_backend_interface,
/* .context = */ ctx, /* .context = */ ctx,
/* .is_ram_shared = */ true, /* .is_ram_shared = */ false,
}; };
// reuses CPU calls for now
backend_metal->interface.free = backend_cpu->interface.free;
backend_metal->interface.alloc_buffer = backend_cpu->interface.alloc_buffer;
backend_metal->interface.set_tensor_async = backend_cpu->interface.set_tensor_async;
backend_metal->interface.get_tensor_async = backend_cpu->interface.get_tensor_async;
backend_metal->interface.synchronize = backend_cpu->interface.synchronize;
backend_metal->interface.cpy_tensor_from = backend_cpu->interface.cpy_tensor_from;
backend_metal->interface.cpy_tensor_to = backend_cpu->interface.cpy_tensor_to;
backend_metal->interface.graph_plan_create = backend_cpu->interface.graph_plan_create;
backend_metal->interface.graph_plan_free = backend_cpu->interface.graph_plan_free;
backend_metal->interface.graph_plan_compute = backend_cpu->interface.graph_plan_compute;
return backend_metal; return backend_metal;
} }

12
ggml.c
View File

@ -4927,6 +4927,7 @@ struct ggml_tensor * ggml_view_tensor(
result->nb[1] = src->nb[1]; result->nb[1] = src->nb[1];
result->nb[2] = src->nb[2]; result->nb[2] = src->nb[2];
result->nb[3] = src->nb[3]; result->nb[3] = src->nb[3];
result->extra = src->extra;
return result; return result;
} }
@ -6262,6 +6263,7 @@ struct ggml_tensor * ggml_reshape(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6287,6 +6289,7 @@ struct ggml_tensor * ggml_reshape_1d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6313,6 +6316,7 @@ struct ggml_tensor * ggml_reshape_2d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6340,6 +6344,7 @@ struct ggml_tensor * ggml_reshape_3d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6369,6 +6374,7 @@ struct ggml_tensor * ggml_reshape_4d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6396,6 +6402,7 @@ struct ggml_tensor * ggml_view_1d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6431,6 +6438,7 @@ struct ggml_tensor * ggml_view_2d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6468,6 +6476,7 @@ struct ggml_tensor * ggml_view_3d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6507,6 +6516,7 @@ struct ggml_tensor * ggml_view_4d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }
@ -6568,6 +6578,7 @@ struct ggml_tensor * ggml_permute(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
int32_t params[] = { axis0, axis1, axis2, axis3 }; int32_t params[] = { axis0, axis1, axis2, axis3 };
ggml_set_op_params(result, &params, sizeof(params)); ggml_set_op_params(result, &params, sizeof(params));
@ -6599,6 +6610,7 @@ struct ggml_tensor * ggml_transpose(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = NULL; result->src[1] = NULL;
result->extra = a->extra;
return result; return result;
} }

View File

@ -234,8 +234,8 @@ struct llama_model {
ggml_context * ctx_cuda = NULL; ggml_context * ctx_cuda = NULL;
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
ggml_backend * backend_metal; ggml_backend * backend_metal = NULL;
ggml_buffer * buf_metal; ggml_buffer * buf_metal = NULL;
ggml_context * ctx_metal = NULL; ggml_context * ctx_metal = NULL;
#endif #endif
@ -991,7 +991,7 @@ static void llama_model_load_internal(
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
if (n_gpu_layers > 0) { if (n_gpu_layers > 0) {
model.backend_metal = ggml_backend_metal_init(backend_cpu); model.backend_metal = ggml_backend_metal_init();
backend_gpu = model.backend_metal; backend_gpu = model.backend_metal;
} }
#endif #endif
@ -1081,7 +1081,6 @@ static void llama_model_load_internal(
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
if (n_gpu_layers > 0) { if (n_gpu_layers > 0) {
// the metal context is actually a CPU context because we have unified memory
const size_t ctx_size = ctx_sizes[model.backend_metal]; const size_t ctx_size = ctx_sizes[model.backend_metal];
const size_t n_tensors = ml->tensors_map.tensors.size(); const size_t n_tensors = ml->tensors_map.tensors.size();
@ -1089,7 +1088,6 @@ static void llama_model_load_internal(
struct ggml_init_params params = ggml_init_params_default(); struct ggml_init_params params = ggml_init_params_default();
params.buffer = model.buf_metal; params.buffer = model.buf_metal;
params.no_alloc = ml->use_mmap;
model.ctx_metal = ggml_init(params); model.ctx_metal = ggml_init(params);
if (!model.ctx_metal) { if (!model.ctx_metal) {
@ -1372,10 +1370,10 @@ static ggml_graph_splits llama_build_graph(
struct ggml_tensor * tmpv = ggml_mul_mat(ctx_l, model.layers[il].wv, cur); struct ggml_tensor * tmpv = ggml_mul_mat(ctx_l, model.layers[il].wv, cur);
ggml_set_name(tmpv, "tmpv"); ggml_set_name(tmpv, "tmpv");
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx_l, ggml_reshape_3d(ctx_l, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); struct ggml_tensor * Kcur = ggml_rope(ctx_l, ggml_reshape_3d(ctx_l, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
ggml_set_name(Kcur, "Kcur"); ggml_set_name(Kcur, "Kcur");
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx_l, ggml_reshape_3d(ctx_l, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); struct ggml_tensor * Qcur = ggml_rope(ctx_l, ggml_reshape_3d(ctx_l, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
ggml_set_name(Qcur, "Qcur"); ggml_set_name(Qcur, "Qcur");
struct ggml_tensor * Vcur = ggml_transpose(ctx_l, ggml_reshape_2d(ctx_l, tmpv, n_embd, N)); struct ggml_tensor * Vcur = ggml_transpose(ctx_l, ggml_reshape_2d(ctx_l, tmpv, n_embd, N));
@ -1428,15 +1426,15 @@ static ggml_graph_splits llama_build_graph(
// KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, 1] // KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx_kv, KQ, KQ_scale); struct ggml_tensor * KQ_scaled = ggml_scale(ctx_kv, KQ, KQ_scale);
ggml_set_name(KQ_scaled, "KQ_scaled"); ggml_set_name(KQ_scaled, "KQ_scaled");
// KQ_masked = mask_past(KQ_scaled) // KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx_kv, KQ_scaled, n_past); struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx_kv, KQ_scaled, n_past);
ggml_set_name(KQ_masked, "KQ_masked"); ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked) // KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx_kv, KQ_masked); struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx_kv, KQ_masked);
ggml_set_name(KQ_soft_max, "KQ_soft_max"); ggml_set_name(KQ_soft_max, "KQ_soft_max");
// split cached V into n_head heads // split cached V into n_head heads
@ -2717,6 +2715,12 @@ struct llama_context * llama_new_context_with_model(
} else { } else {
ctx->backend_kv = model->backend_cpu; ctx->backend_kv = model->backend_cpu;
} }
#elif GGML_USE_METAL
if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2 && !params.low_vram) {
ctx->backend_kv = model->backend_metal;
} else {
ctx->backend_kv = model->backend_cpu;
}
#else #else
ctx->backend_kv = model->backend_cpu; ctx->backend_kv = model->backend_cpu;
#endif #endif
@ -2817,49 +2821,6 @@ struct llama_context * llama_new_context_with_model(
} }
} }
#ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) {
void * data_ptr = NULL;
size_t data_size = 0;
if (params.use_mmap) {
data_ptr = ctx->model.mapping->addr;
data_size = ctx->model.mapping->size;
} else {
data_ptr = ggml_get_mem_buffer(ctx->model.ctx_metal);
data_size = ggml_get_mem_size (ctx->model.ctx_metal);
}
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx_metal);
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
#define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
llama_free(ctx); \
return NULL; \
}
LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "data", data_ptr, data_size, max_size));
struct ggml_backend_buffer * buf_compute = ctx->buf_compute_metal->backend_buffer;
struct ggml_backend_buffer * buf_kv = ctx->kv_self.buf->backend_buffer;
struct ggml_backend_buffer * buf_input = ctx->buf_input->backend_buffer;
struct ggml_backend_buffer * buf_output = ctx->buf_output->backend_buffer;
LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "eval", buf_compute->backend_data, buf_compute->backend_size, 0));
LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "kv", buf_kv->backend_data, buf_kv->backend_size, 0));
LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "inp", buf_input->backend_data, buf_input->backend_size, 0));
LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "inp", buf_output->backend_data, buf_output->backend_size, 0));
//LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
//LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
#undef LLAMA_METAL_CHECK_BUF
}
#endif
fprintf(stderr, "%s: layer backends: ", __func__); fprintf(stderr, "%s: layer backends: ", __func__);
fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp)); fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp));
@ -3150,14 +3111,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
ggml_set_name(scale_tensor, "scale_tensor"); ggml_set_name(scale_tensor, "scale_tensor");
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); BA = ggml_scale(lora_ctx, BA, scale_tensor);
ggml_set_name(BA, "BA_scaled"); ggml_set_name(BA, "BA_scaled");
} }
ggml_tensor * r; ggml_tensor * r;
if (base_t == dest_t) { if (base_t == dest_t) {
r = ggml_add_inplace(lora_ctx, dest_t, BA); r = ggml_add(lora_ctx, dest_t, BA);
ggml_set_name(r, "r_add_inplace"); ggml_set_name(r, "r_add");
} }
else { else {
r = ggml_add(lora_ctx, base_t, BA); r = ggml_add(lora_ctx, base_t, BA);