Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2025-01-11 03:01:45 +00:00
Adjust Metal buffer allocation to avoid allocating beyond MTLDevice.recommendedMaxWorkingSetSize
This commit is contained in:
parent b213227067
commit da7d2f9587
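In outline, the change makes ggml_metal_add_buffer() only record each host buffer view (name, data pointer, size) instead of creating its MTLBuffer up front. Views are materialized on demand while encoding the graph, and only when MTLDevice.currentAllocatedSize plus the view size stays within MTLDevice.recommendedMaxWorkingSetSize; otherwise the encoder either ends the current command buffer early or waits on an NSCondition until a completed command buffer drops its gpu_use_count references and an unused view can be released. Below is a minimal C sketch of just this admission rule; device_budget, metal_view_fits, and metal_view_decide are hypothetical names used for illustration and are not part of the patch.

    #include <stdbool.h>
    #include <stddef.h>

    // Mirrors the two MTLDevice properties the patch consults (illustrative only).
    struct device_budget {
        size_t current_allocated; // MTLDevice.currentAllocatedSize
        size_t recommended_max;   // MTLDevice.recommendedMaxWorkingSetSize
    };

    // A view may be materialized only if it keeps the device within its working-set budget.
    static bool metal_view_fits(const struct device_budget * b, size_t view_size) {
        return b->current_allocated + view_size <= b->recommended_max;
    }

    enum view_action { VIEW_ALLOCATE, VIEW_SKIP, VIEW_WAIT };

    // Per-view decision while encoding: allocate when the budget allows, otherwise
    // cut the command buffer short (VIEW_SKIP) or wait for another view to be released.
    static enum view_action metal_view_decide(const struct device_budget * b, size_t view_size, bool wait_for_alloc) {
        if (metal_view_fits(b, view_size)) {
            return VIEW_ALLOCATE;
        }
        return wait_for_alloc ? VIEW_WAIT : VIEW_SKIP;
    }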
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
     *(int32_t *) input->data = 1; // BOS
 
-    ggml_metal_set_tensor(ctx_metal, input);
-
     // warmup
     ggml_metal_graph_compute(ctx_metal, &gf);
 
@@ -72,7 +70,6 @@ int main(int argc, char ** argv) {
     // debug output
     {
         struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
-        ggml_metal_get_tensor(ctx_metal, logits);
 
         float * ptr = (float *) ggml_get_data(logits);
 
ggml-metal.h (11 changes)
@@ -13,9 +13,6 @@
 // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
 // used during the graph evaluation to determine the arguments of the compute kernels.
 //
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//
 
 #pragma once
 
@@ -23,7 +20,7 @@
 #include <stdbool.h>
 
 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_BUFFERS 256
 
 struct ggml_tensor;
 struct ggml_cgraph;
@@ -51,12 +48,6 @@ bool ggml_metal_add_buffer(
         size_t size,
         size_t max_size);
 
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
ggml-metal.m (263 changes)
@@ -22,6 +22,7 @@ struct ggml_metal_buffer {
     size_t size;
 
     id<MTLBuffer> metal;
+    int gpu_use_count;
 };
 
 struct ggml_metal_context {
@@ -30,6 +31,9 @@ struct ggml_metal_context {
     id<MTLDevice> device;
     id<MTLCommandQueue> queue;
    id<MTLLibrary> library;
+    NSCondition *buffer_allocation_condition;
+    dispatch_queue_t command_dispatch_queue;
+    bool allocation_waiting;
 
     int n_buffers;
     struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
@@ -90,10 +94,14 @@ struct ggml_metal_context * ggml_metal_init(void) {
     fprintf(stderr, "%s: allocating\n", __func__);
 
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    memset(ctx, 0, sizeof(struct ggml_metal_context));
 
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
+    ctx->command_dispatch_queue = dispatch_queue_create("llama.cpp.command_dispatch", DISPATCH_QUEUE_CONCURRENT);
+    ctx->buffer_allocation_condition = [[NSCondition alloc] init];
+    ctx->allocation_waiting = false;
 
     // determine if we can use MPS
     if (MPSSupportsMTLDevice(ctx->device)) {
@@ -151,7 +159,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
 #define GGML_METAL_ADD_KERNEL(name) \
         ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
         ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
-        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
+        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (__bridge void *) ctx->pipeline_##name);
 
         GGML_METAL_ADD_KERNEL(add);
         GGML_METAL_ADD_KERNEL(mul);
@ -188,7 +196,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|||||||
|
|
||||||
#undef GGML_METAL_ADD_KERNEL
|
#undef GGML_METAL_ADD_KERNEL
|
||||||
}
|
}
|
||||||
|
fprintf(stderr, "%s: currentAllocatedSize = %8.2f MB\n", __func__, ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||||
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||||
if (ctx->device.maxTransferRate != 0) {
|
if (ctx->device.maxTransferRate != 0) {
|
||||||
@ -205,6 +213,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||||
[ctx->buffers[i].metal release];
|
[ctx->buffers[i].metal release];
|
||||||
}
|
}
|
||||||
|
ctx->command_dispatch_queue = nil;
|
||||||
|
[ctx->buffer_allocation_condition release];
|
||||||
|
ctx->buffer_allocation_condition = nil;
|
||||||
|
ctx->n_buffers = 0;
|
||||||
|
ctx->device = nil;
|
||||||
|
ctx->library = nil;
|
||||||
|
ctx->queue = nil;
|
||||||
free(ctx);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,7 +227,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
+static int ggml_metal_get_buffer_index(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
@@ -220,19 +235,82 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     // find the view that contains the tensor fully
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
+        size_t size = ctx->buffers[i].size;
 
-        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) size) {
             *offs = (size_t) ioffs;
-            //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
-
-            return ctx->buffers[i].metal;
+            return i;
         }
     }
 
-    fprintf(stderr, "%s: error: buffer is nil\n", __func__);
+    fprintf(stderr, "%s: error: buffer is NULL\n", __func__);
+    return -1;
+}
+
+static void ggml_metal_release_unused_buffer(struct ggml_metal_context * ctx) {
+    // Release a buffer that no command buffers are actively using
+    if (ctx->allocation_waiting) {
+        for (int i = 0; i < ctx->n_buffers; ++i) {
+            if (ctx->buffers[i].gpu_use_count == 0 && ctx->buffers[i].metal != nil) {
+                [ctx->buffers[i].metal release];
+                ctx->buffers[i].metal = nil;
+                ctx->allocation_waiting = false;
+                break;
+            }
+        }
+    }
+}
+
+static void ggml_metal_signal_buffer_dealloc(struct ggml_metal_context * ctx, int buf_idx) {
+    void *data = ctx->buffers[buf_idx].data;
+    const char * name = ctx->buffers[buf_idx].name;
+    size_t address = (size_t) data;
+
+    [ctx->buffer_allocation_condition lock];
+    [ctx->buffer_allocation_condition signal];
+    [ctx->buffer_allocation_condition unlock];
+}
+
+static void ggml_metal_allocate_buffer(struct ggml_metal_context * ctx, int buf_idx) {
+    void *data = ctx->buffers[buf_idx].data;
+    const char * name = ctx->buffers[buf_idx].name;
+    size_t size = ctx->buffers[buf_idx].size;
+    size_t address = (size_t) data;
+
+    ctx->buffers[buf_idx].metal = [ctx->device newBufferWithBytesNoCopy:data
+                                       length:size
+                                       options:MTLResourceStorageModeShared
+                                       deallocator:^(void *const ptr, const NSUInteger len) {
+        dispatch_async(ctx->command_dispatch_queue, ^{
+            ggml_metal_signal_buffer_dealloc(ctx, buf_idx);
+        });
+    }];
+}
+
+static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, int buf_idx, bool wait_for_alloc) {
+    if (buf_idx < 0) {
         return nil;
+    }
+    size_t size = ctx->buffers[buf_idx].size;
+
+    while (true) {
+        if (ctx->buffers[buf_idx].metal != nil) {
+            return ctx->buffers[buf_idx].metal;
+        }
+        if (ctx->device.currentAllocatedSize + size <= ctx->device.recommendedMaxWorkingSetSize) {
+            ggml_metal_allocate_buffer(ctx, buf_idx);
+            return ctx->buffers[buf_idx].metal;
+        }
+        if (!wait_for_alloc) {
+            return nil;
+        }
+
+        ctx->allocation_waiting = true;
+        ggml_metal_release_unused_buffer(ctx);
+        [ctx->buffer_allocation_condition wait];
+    }
+
+    return ctx->buffers[buf_idx].metal;
 }
 
 bool ggml_metal_add_buffer(
@@ -268,24 +346,18 @@ bool ggml_metal_add_buffer(
         if (size_aligned <= ctx->device.maxBufferLength) {
             ctx->buffers[ctx->n_buffers].name = name;
             ctx->buffers[ctx->n_buffers].data = data;
-            ctx->buffers[ctx->n_buffers].size = size;
-            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
-                return false;
-            }
-
-            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
-
+            ctx->buffers[ctx->n_buffers].size = size_aligned;
+            ctx->buffers[ctx->n_buffers].metal = nil;
+            ctx->buffers[ctx->n_buffers].gpu_use_count = 0;
+            fprintf(stderr, "%s: prepared size for '%-16s' buffer (%d), size = %8.2f MB\n", __func__, name, ctx->n_buffers, size_aligned / 1024.0 / 1024.0);
             ++ctx->n_buffers;
         } else {
             // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
             // one of the views
+            size_t max_buffer_length = 1 * 1024ul * 1024ul * 1024ul; // ctx->device.maxBufferLength;
             const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-            const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
-            const size_t size_view = ctx->device.maxBufferLength;
+            const size_t size_step = max_buffer_length - size_ovlp;
+            const size_t size_view = max_buffer_length;
 
             for (size_t i = 0; i < size; i += size_step) {
                 const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
@@ -293,59 +365,18 @@ bool ggml_metal_add_buffer(
                 ctx->buffers[ctx->n_buffers].name = name;
                 ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
                 ctx->buffers[ctx->n_buffers].size = size_step_aligned;
-                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
-                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
-                    return false;
-                }
-
-                fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
-                if (i + size_step < size) {
-                    fprintf(stderr, "\n");
-                }
-
+                ctx->buffers[ctx->n_buffers].metal = nil;
+                ctx->buffers[ctx->n_buffers].gpu_use_count = 0;
+                fprintf(stderr, "%s: prepared size for '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                 ++ctx->n_buffers;
             }
         }
 
-        fprintf(stderr, ", (%8.2f / %8.2f)",
-            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
-            fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
-        } else {
-            fprintf(stderr, "\n");
-        }
     }
 
     return true;
 }
 
-void ggml_metal_set_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
-
-    size_t offs;
-    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
-}
-
-void ggml_metal_get_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
-
-    size_t offs;
-    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
-}
-
 void ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
@@ -354,38 +385,82 @@ void ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_cb = gf->n_threads;
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:16];
 
-    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+    int buf_idx_src0 = -1;
+    int buf_idx_src1 = -1;
+    int buf_idx_dst = -1;
 
-    for (int i = 0; i < n_cb; ++i) {
-        command_buffers[i] = [ctx->queue commandBuffer];
+    int node_curr = 0;
+    while (node_curr < gf->n_nodes) {
+        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
+        [command_buffers addObject:command_buffer];
+        [command_buffer enqueue];
 
-        // enqueue the command buffers in order to specify their execution order
-        [command_buffers[i] enqueue];
-    }
-
-    // TODO: is this the best way to start threads?
-    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
-
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
-
-        dispatch_async(queue, ^{
+        [ctx->buffer_allocation_condition lock];
+        NSMutableSet *buf_idxs_used = [NSMutableSet set];
+        int node_start = node_curr;
+        while (node_curr < gf->n_nodes) {
             size_t offs_src0 = 0;
             size_t offs_src1 = 0;
             size_t offs_dst = 0;
 
-            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
+            struct ggml_tensor * src0 = gf->nodes[node_curr]->src0;
+            struct ggml_tensor * src1 = gf->nodes[node_curr]->src1;
+            struct ggml_tensor * dst = gf->nodes[node_curr];
+
+            bool wait_for_alloc = (node_start == node_curr);
+
+#define GGML_METAL_ALLOC_OR_BREAK(buffer_name) \
+            if (buffer_name) { \
+                if (buf_idx_##buffer_name < 0) { \
+                    int buf_idx = ggml_metal_get_buffer_index(ctx, buffer_name, &offs_##buffer_name); \
+                    if (ggml_metal_get_buffer(ctx, buf_idx, wait_for_alloc)) { \
+                        buf_idx_##buffer_name = buf_idx; \
+                        [buf_idxs_used addObject:[NSNumber numberWithInt:buf_idx]]; \
+                    } else { \
+                        break; \
+                    } \
+                } \
+            }
+
+            GGML_METAL_ALLOC_OR_BREAK(src0)
+            GGML_METAL_ALLOC_OR_BREAK(src1)
+            GGML_METAL_ALLOC_OR_BREAK(dst)
+
+#undef GGML_METAL_ALLOC_OR_BREAK
+
+            buf_idx_src0 = -1;
+            buf_idx_src1 = -1;
+            buf_idx_dst = -1;
+            ++node_curr;
+        }
+
+        for (NSNumber *buf_idx_num in buf_idxs_used) {
+            int buf_idx = [buf_idx_num intValue];
+            ctx->buffers[buf_idx].gpu_use_count++;
+        }
+        [ctx->buffer_allocation_condition unlock];
+
+        [command_buffer addCompletedHandler: ^(id<MTLCommandBuffer> cb) {
+            [ctx->buffer_allocation_condition lock];
+            for (NSNumber *buf_idx_num in buf_idxs_used) {
+                int buf_idx = [buf_idx_num intValue];
+                ctx->buffers[buf_idx].gpu_use_count--;
+            }
+            ggml_metal_release_unused_buffer(ctx);
+            [ctx->buffer_allocation_condition unlock];
+        }];
 
         id<MTLComputeCommandEncoder> encoder = nil;
 
-            const int node_start = (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
-
-            for (int i = node_start; i < node_end; ++i) {
+        for (int i = node_start; i < node_curr; ++i) {
             metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
+            size_t offs_src0 = 0;
+            size_t offs_src1 = 0;
+            size_t offs_dst = 0;
+
             struct ggml_tensor * src0 = gf->nodes[i]->src0;
             struct ggml_tensor * src1 = gf->nodes[i]->src1;
             struct ggml_tensor * dst = gf->nodes[i];
@@ -424,9 +499,11 @@ void ggml_metal_graph_compute(
             const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
             const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
 
-            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-            id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
+            [ctx->buffer_allocation_condition lock];
+            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, src0, &offs_src0), false) : nil;
+            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, src1, &offs_src1), false) : nil;
+            id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, dst, &offs_dst), false) : nil;
+            [ctx->buffer_allocation_condition unlock];
 
             //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
             //if (src0) {
@@ -960,12 +1037,9 @@ void ggml_metal_graph_compute(
         }
 
         [command_buffer commit];
-        });
     }
 
-    // wait for all threads to finish
-    dispatch_barrier_sync(queue, ^{});
-
+    int n_cb = [command_buffers count];
     [command_buffers[n_cb - 1] waitUntilCompleted];
 
     // check status of command buffers
@@ -973,7 +1047,8 @@ void ggml_metal_graph_compute(
     for (int i = 0; i < n_cb; i++) {
        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
        if (status != MTLCommandBufferStatusCompleted) {
-           fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+           const char *error_str = [[[command_buffers[i] error] description] cStringUsingEncoding:NSUTF8StringEncoding];
+           fprintf(stderr, "%s: command buffer %d failed with status %lu: %s\n", __func__, i, status, error_str);
            GGML_ASSERT(false);
        }
    }
@@ -1555,7 +1555,6 @@ static bool llama_eval_internal(
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
-        ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1564,14 +1563,6 @@ static bool llama_eval_internal(
         //
         // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
         // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
 
         ggml_graph_compute(ctx0, &gf);
     }