Adjust Metal buffer allocation to avoid allocating beyond MTLDevice.recommendedMaxWorkingSetSize

This commit is contained in:
Kilty McGowan 2023-07-01 21:33:16 -07:00
parent b213227067
commit da7d2f9587
4 changed files with 691 additions and 637 deletions

View File

@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd"); struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
*(int32_t *) input->data = 1; // BOS *(int32_t *) input->data = 1; // BOS
ggml_metal_set_tensor(ctx_metal, input);
// warmup // warmup
ggml_metal_graph_compute(ctx_metal, &gf); ggml_metal_graph_compute(ctx_metal, &gf);
@ -72,7 +70,6 @@ int main(int argc, char ** argv) {
// debug output // debug output
{ {
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1]; struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
ggml_metal_get_tensor(ctx_metal, logits);
float * ptr = (float *) ggml_get_data(logits); float * ptr = (float *) ggml_get_data(logits);

View File

@ -13,9 +13,6 @@
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels. // used during the graph evaluation to determine the arguments of the compute kernels.
// //
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//
#pragma once #pragma once
@ -23,7 +20,7 @@
#include <stdbool.h> #include <stdbool.h>
// max memory buffers that can be mapped to the device // max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16 #define GGML_METAL_MAX_BUFFERS 256
struct ggml_tensor; struct ggml_tensor;
struct ggml_cgraph; struct ggml_cgraph;
@ -51,12 +48,6 @@ bool ggml_metal_add_buffer(
size_t size, size_t size,
size_t max_size); size_t max_size);
// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// same as ggml_graph_compute but uses Metal // same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel // creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

View File

@ -22,6 +22,7 @@ struct ggml_metal_buffer {
size_t size; size_t size;
id<MTLBuffer> metal; id<MTLBuffer> metal;
int gpu_use_count;
}; };
struct ggml_metal_context { struct ggml_metal_context {
@ -30,6 +31,9 @@ struct ggml_metal_context {
id<MTLDevice> device; id<MTLDevice> device;
id<MTLCommandQueue> queue; id<MTLCommandQueue> queue;
id<MTLLibrary> library; id<MTLLibrary> library;
NSCondition *buffer_allocation_condition;
dispatch_queue_t command_dispatch_queue;
bool allocation_waiting;
int n_buffers; int n_buffers;
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
@ -90,10 +94,14 @@ struct ggml_metal_context * ggml_metal_init(void) {
fprintf(stderr, "%s: allocating\n", __func__); fprintf(stderr, "%s: allocating\n", __func__);
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
memset(ctx, 0, sizeof(struct ggml_metal_context));
ctx->device = MTLCreateSystemDefaultDevice(); ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue]; ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0; ctx->n_buffers = 0;
ctx->command_dispatch_queue = dispatch_queue_create("llama.cpp.command_dispatch", DISPATCH_QUEUE_CONCURRENT);
ctx->buffer_allocation_condition = [[NSCondition alloc] init];
ctx->allocation_waiting = false;
// determine if we can use MPS // determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) { if (MPSSupportsMTLDevice(ctx->device)) {
@ -151,7 +159,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
#define GGML_METAL_ADD_KERNEL(name) \ #define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (__bridge void *) ctx->pipeline_##name);
GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(add);
GGML_METAL_ADD_KERNEL(mul); GGML_METAL_ADD_KERNEL(mul);
@ -188,7 +196,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
#undef GGML_METAL_ADD_KERNEL #undef GGML_METAL_ADD_KERNEL
} }
fprintf(stderr, "%s: currentAllocatedSize = %8.2f MB\n", __func__, ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
if (ctx->device.maxTransferRate != 0) { if (ctx->device.maxTransferRate != 0) {
@ -205,6 +213,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
for (int i = 0; i < ctx->n_buffers; ++i) { for (int i = 0; i < ctx->n_buffers; ++i) {
[ctx->buffers[i].metal release]; [ctx->buffers[i].metal release];
} }
ctx->command_dispatch_queue = nil;
[ctx->buffer_allocation_condition release];
ctx->buffer_allocation_condition = nil;
ctx->n_buffers = 0;
ctx->device = nil;
ctx->library = nil;
ctx->queue = nil;
free(ctx); free(ctx);
} }
@ -212,7 +227,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer // Metal buffer based on the host memory pointer
// //
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { static int ggml_metal_get_buffer_index(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
const int64_t tsize = ggml_nbytes(t); const int64_t tsize = ggml_nbytes(t);
@ -220,19 +235,82 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
// find the view that contains the tensor fully // find the view that contains the tensor fully
for (int i = 0; i < ctx->n_buffers; ++i) { for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
size_t size = ctx->buffers[i].size;
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { if (ioffs >= 0 && ioffs + tsize <= (int64_t) size) {
*offs = (size_t) ioffs; *offs = (size_t) ioffs;
return i;
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
return ctx->buffers[i].metal;
} }
} }
fprintf(stderr, "%s: error: buffer is nil\n", __func__); fprintf(stderr, "%s: error: buffer is NULL\n", __func__);
return -1;
}
static void ggml_metal_release_unused_buffer(struct ggml_metal_context * ctx) {
// Release a buffer that no command buffers are actively using
if (ctx->allocation_waiting) {
for (int i = 0; i < ctx->n_buffers; ++i) {
if (ctx->buffers[i].gpu_use_count == 0 && ctx->buffers[i].metal != nil) {
[ctx->buffers[i].metal release];
ctx->buffers[i].metal = nil;
ctx->allocation_waiting = false;
break;
}
}
}
}
static void ggml_metal_signal_buffer_dealloc(struct ggml_metal_context * ctx, int buf_idx) {
void *data = ctx->buffers[buf_idx].data;
const char * name = ctx->buffers[buf_idx].name;
size_t address = (size_t) data;
[ctx->buffer_allocation_condition lock];
[ctx->buffer_allocation_condition signal];
[ctx->buffer_allocation_condition unlock];
}
static void ggml_metal_allocate_buffer(struct ggml_metal_context * ctx, int buf_idx) {
void *data = ctx->buffers[buf_idx].data;
const char * name = ctx->buffers[buf_idx].name;
size_t size = ctx->buffers[buf_idx].size;
size_t address = (size_t) data;
ctx->buffers[buf_idx].metal = [ctx->device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:^(void *const ptr, const NSUInteger len) {
dispatch_async(ctx->command_dispatch_queue, ^{
ggml_metal_signal_buffer_dealloc(ctx, buf_idx);
});
}];
}
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, int buf_idx, bool wait_for_alloc) {
if (buf_idx < 0) {
return nil; return nil;
}
size_t size = ctx->buffers[buf_idx].size;
while (true) {
if (ctx->buffers[buf_idx].metal != nil) {
return ctx->buffers[buf_idx].metal;
}
if (ctx->device.currentAllocatedSize + size <= ctx->device.recommendedMaxWorkingSetSize) {
ggml_metal_allocate_buffer(ctx, buf_idx);
return ctx->buffers[buf_idx].metal;
}
if (!wait_for_alloc) {
return nil;
}
ctx->allocation_waiting = true;
ggml_metal_release_unused_buffer(ctx);
[ctx->buffer_allocation_condition wait];
}
return ctx->buffers[buf_idx].metal;
} }
bool ggml_metal_add_buffer( bool ggml_metal_add_buffer(
@ -268,24 +346,18 @@ bool ggml_metal_add_buffer(
if (size_aligned <= ctx->device.maxBufferLength) { if (size_aligned <= ctx->device.maxBufferLength) {
ctx->buffers[ctx->n_buffers].name = name; ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = data; ctx->buffers[ctx->n_buffers].data = data;
ctx->buffers[ctx->n_buffers].size = size; ctx->buffers[ctx->n_buffers].size = size_aligned;
ctx->buffers[ctx->n_buffers].metal = nil;
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffers[ctx->n_buffers].gpu_use_count = 0;
fprintf(stderr, "%s: prepared size for '%-16s' buffer (%d), size = %8.2f MB\n", __func__, name, ctx->n_buffers, size_aligned / 1024.0 / 1024.0);
if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
return false;
}
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
++ctx->n_buffers; ++ctx->n_buffers;
} else { } else {
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
// one of the views // one of the views
size_t max_buffer_length = 1 * 1024ul * 1024ul * 1024ul; // ctx->device.maxBufferLength;
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
const size_t size_step = ctx->device.maxBufferLength - size_ovlp; const size_t size_step = max_buffer_length - size_ovlp;
const size_t size_view = ctx->device.maxBufferLength; const size_t size_view = max_buffer_length;
for (size_t i = 0; i < size; i += size_step) { for (size_t i = 0; i < size; i += size_step) {
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
@ -293,59 +365,18 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].name = name; ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
ctx->buffers[ctx->n_buffers].size = size_step_aligned; ctx->buffers[ctx->n_buffers].size = size_step_aligned;
ctx->buffers[ctx->n_buffers].metal = nil;
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffers[ctx->n_buffers].gpu_use_count = 0;
fprintf(stderr, "%s: prepared size for '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
return false;
}
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
if (i + size_step < size) {
fprintf(stderr, "\n");
}
++ctx->n_buffers; ++ctx->n_buffers;
} }
} }
fprintf(stderr, ", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
} else {
fprintf(stderr, "\n");
}
} }
return true; return true;
} }
void ggml_metal_set_tensor(
struct ggml_metal_context * ctx,
struct ggml_tensor * t) {
metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
size_t offs;
id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
}
void ggml_metal_get_tensor(
struct ggml_metal_context * ctx,
struct ggml_tensor * t) {
metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
size_t offs;
id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
}
void ggml_metal_graph_compute( void ggml_metal_graph_compute(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) { struct ggml_cgraph * gf) {
@ -354,38 +385,82 @@ void ggml_metal_graph_compute(
// create multiple command buffers and enqueue them // create multiple command buffers and enqueue them
// then, we encode the graph into the command buffers in parallel // then, we encode the graph into the command buffers in parallel
const int n_cb = gf->n_threads; NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:16];
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; int buf_idx_src0 = -1;
int buf_idx_src1 = -1;
int buf_idx_dst = -1;
for (int i = 0; i < n_cb; ++i) { int node_curr = 0;
command_buffers[i] = [ctx->queue commandBuffer]; while (node_curr < gf->n_nodes) {
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
[command_buffers addObject:command_buffer];
[command_buffer enqueue];
// enqueue the command buffers in order to specify their execution order [ctx->buffer_allocation_condition lock];
[command_buffers[i] enqueue]; NSMutableSet *buf_idxs_used = [NSMutableSet set];
} int node_start = node_curr;
while (node_curr < gf->n_nodes) {
// TODO: is this the best way to start threads?
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
dispatch_async(queue, ^{
size_t offs_src0 = 0; size_t offs_src0 = 0;
size_t offs_src1 = 0; size_t offs_src1 = 0;
size_t offs_dst = 0; size_t offs_dst = 0;
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx]; struct ggml_tensor * src0 = gf->nodes[node_curr]->src0;
struct ggml_tensor * src1 = gf->nodes[node_curr]->src1;
struct ggml_tensor * dst = gf->nodes[node_curr];
bool wait_for_alloc = (node_start == node_curr);
#define GGML_METAL_ALLOC_OR_BREAK(buffer_name) \
if (buffer_name) { \
if (buf_idx_##buffer_name < 0) { \
int buf_idx = ggml_metal_get_buffer_index(ctx, buffer_name, &offs_##buffer_name); \
if (ggml_metal_get_buffer(ctx, buf_idx, wait_for_alloc)) { \
buf_idx_##buffer_name = buf_idx; \
[buf_idxs_used addObject:[NSNumber numberWithInt:buf_idx]]; \
} else { \
break; \
} \
} \
}
GGML_METAL_ALLOC_OR_BREAK(src0)
GGML_METAL_ALLOC_OR_BREAK(src1)
GGML_METAL_ALLOC_OR_BREAK(dst)
#undef GGML_METAL_ALLOC_OR_BREAK
buf_idx_src0 = -1;
buf_idx_src1 = -1;
buf_idx_dst = -1;
++node_curr;
}
for (NSNumber *buf_idx_num in buf_idxs_used) {
int buf_idx = [buf_idx_num intValue];
ctx->buffers[buf_idx].gpu_use_count++;
}
[ctx->buffer_allocation_condition unlock];
[command_buffer addCompletedHandler: ^(id<MTLCommandBuffer> cb) {
[ctx->buffer_allocation_condition lock];
for (NSNumber *buf_idx_num in buf_idxs_used) {
int buf_idx = [buf_idx_num intValue];
ctx->buffers[buf_idx].gpu_use_count--;
}
ggml_metal_release_unused_buffer(ctx);
[ctx->buffer_allocation_condition unlock];
}];
id<MTLComputeCommandEncoder> encoder = nil; id<MTLComputeCommandEncoder> encoder = nil;
const int node_start = (cb_idx + 0) * n_nodes_per_cb; for (int i = node_start; i < node_curr; ++i) {
const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
for (int i = node_start; i < node_end; ++i) {
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
size_t offs_src0 = 0;
size_t offs_src1 = 0;
size_t offs_dst = 0;
struct ggml_tensor * src0 = gf->nodes[i]->src0; struct ggml_tensor * src0 = gf->nodes[i]->src0;
struct ggml_tensor * src1 = gf->nodes[i]->src1; struct ggml_tensor * src1 = gf->nodes[i]->src1;
struct ggml_tensor * dst = gf->nodes[i]; struct ggml_tensor * dst = gf->nodes[i];
@ -424,9 +499,11 @@ void ggml_metal_graph_compute(
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; [ctx->buffer_allocation_condition lock];
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, src0, &offs_src0), false) : nil;
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, src1, &offs_src1), false) : nil;
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, dst, &offs_dst), false) : nil;
[ctx->buffer_allocation_condition unlock];
//metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
//if (src0) { //if (src0) {
@ -960,12 +1037,9 @@ void ggml_metal_graph_compute(
} }
[command_buffer commit]; [command_buffer commit];
});
} }
// wait for all threads to finish int n_cb = [command_buffers count];
dispatch_barrier_sync(queue, ^{});
[command_buffers[n_cb - 1] waitUntilCompleted]; [command_buffers[n_cb - 1] waitUntilCompleted];
// check status of command buffers // check status of command buffers
@ -973,7 +1047,8 @@ void ggml_metal_graph_compute(
for (int i = 0; i < n_cb; i++) { for (int i = 0; i < n_cb; i++) {
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status]; MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
if (status != MTLCommandBufferStatusCompleted) { if (status != MTLCommandBufferStatusCompleted) {
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); const char *error_str = [[[command_buffers[i] error] description] cStringUsingEncoding:NSUTF8StringEncoding];
fprintf(stderr, "%s: command buffer %d failed with status %lu: %s\n", __func__, i, status, error_str);
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }

View File

@ -1555,7 +1555,6 @@ static bool llama_eval_internal(
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) { if (lctx.ctx_metal && N == 1) {
ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_graph_compute(lctx.ctx_metal, &gf);
ggml_metal_get_tensor (lctx.ctx_metal, cur);
} else { } else {
// IMPORTANT: // IMPORTANT:
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@ -1564,14 +1563,6 @@ static bool llama_eval_internal(
// //
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch. // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
// But for now, we have focused only on Matrix x Vector Metal multiplication. // But for now, we have focused only on Matrix x Vector Metal multiplication.
//
// TODO: avoid these syncs via shared memory (ref #1696)
//
if (lctx.ctx_metal) {
// We need to sync the GPU KV cache with the CPU KV cache
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
ggml_graph_compute(ctx0, &gf); ggml_graph_compute(ctx0, &gf);
} }