Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2025-01-11 03:01:45 +00:00
Adjust Metal buffer allocation to avoid allocating beyond MTLDevice.recommendedMaxWorkingSetSize
This commit is contained in:
parent b213227067
commit da7d2f9587
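In outline, the change makes ggml_metal_add_buffer() only record each host buffer view (name, data pointer, size) instead of creating its MTLBuffer up front. Views are materialized on demand while encoding the graph, and only when MTLDevice.currentAllocatedSize plus the view size stays within MTLDevice.recommendedMaxWorkingSetSize; otherwise the encoder either ends the current command buffer early or waits on an NSCondition until a completed command buffer drops its gpu_use_count references and an unused view can be released. Below is a minimal C sketch of just this admission rule; device_budget, metal_view_fits, and metal_view_decide are hypothetical names used for illustration and are not part of the patch.

    #include <stdbool.h>
    #include <stddef.h>

    // Mirrors the two MTLDevice properties the patch consults (illustrative only).
    struct device_budget {
        size_t current_allocated; // MTLDevice.currentAllocatedSize
        size_t recommended_max;   // MTLDevice.recommendedMaxWorkingSetSize
    };

    // A view may be materialized only if it keeps the device within its working-set budget.
    static bool metal_view_fits(const struct device_budget * b, size_t view_size) {
        return b->current_allocated + view_size <= b->recommended_max;
    }

    enum view_action { VIEW_ALLOCATE, VIEW_SKIP, VIEW_WAIT };

    // Per-view decision while encoding: allocate when the budget allows, otherwise
    // cut the command buffer short (VIEW_SKIP) or wait for another view to be released.
    static enum view_action metal_view_decide(const struct device_budget * b, size_t view_size, bool wait_for_alloc) {
        if (metal_view_fits(b, view_size)) {
            return VIEW_ALLOCATE;
        }
        return wait_for_alloc ? VIEW_WAIT : VIEW_SKIP;
    }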
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
     *(int32_t *) input->data = 1; // BOS
 
-    ggml_metal_set_tensor(ctx_metal, input);
-
     // warmup
     ggml_metal_graph_compute(ctx_metal, &gf);
 
@@ -72,7 +70,6 @@ int main(int argc, char ** argv) {
     // debug output
     {
         struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
-        ggml_metal_get_tensor(ctx_metal, logits);
 
         float * ptr = (float *) ggml_get_data(logits);
 
ggml-metal.h (11 changes)
@@ -13,9 +13,6 @@
 // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
 // used during the graph evaluation to determine the arguments of the compute kernels.
 //
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//
 
 #pragma once
 
@@ -23,7 +20,7 @@
 #include <stdbool.h>
 
 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_BUFFERS 256
 
 struct ggml_tensor;
 struct ggml_cgraph;
@@ -51,12 +48,6 @@ bool ggml_metal_add_buffer(
         size_t size,
         size_t max_size);
 
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
ggml-metal.m (263 changes)
@@ -22,6 +22,7 @@ struct ggml_metal_buffer {
     size_t size;
 
     id<MTLBuffer> metal;
+    int gpu_use_count;
 };
 
 struct ggml_metal_context {
@@ -30,6 +31,9 @@ struct ggml_metal_context {
     id<MTLDevice> device;
     id<MTLCommandQueue> queue;
    id<MTLLibrary> library;
+    NSCondition *buffer_allocation_condition;
+    dispatch_queue_t command_dispatch_queue;
+    bool allocation_waiting;
 
     int n_buffers;
     struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
@@ -90,10 +94,14 @@ struct ggml_metal_context * ggml_metal_init(void) {
     fprintf(stderr, "%s: allocating\n", __func__);
 
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    memset(ctx, 0, sizeof(struct ggml_metal_context));
 
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
+    ctx->command_dispatch_queue = dispatch_queue_create("llama.cpp.command_dispatch", DISPATCH_QUEUE_CONCURRENT);
+    ctx->buffer_allocation_condition = [[NSCondition alloc] init];
+    ctx->allocation_waiting = false;
 
     // determine if we can use MPS
     if (MPSSupportsMTLDevice(ctx->device)) {
@@ -151,7 +159,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
 #define GGML_METAL_ADD_KERNEL(name) \
         ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
         ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
-        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
+        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (__bridge void *) ctx->pipeline_##name);
 
         GGML_METAL_ADD_KERNEL(add);
         GGML_METAL_ADD_KERNEL(mul);
@ -188,7 +196,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|||||||
|
|
||||||
#undef GGML_METAL_ADD_KERNEL
|
#undef GGML_METAL_ADD_KERNEL
|
||||||
}
|
}
|
||||||
|
fprintf(stderr, "%s: currentAllocatedSize = %8.2f MB\n", __func__, ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||||
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||||
if (ctx->device.maxTransferRate != 0) {
|
if (ctx->device.maxTransferRate != 0) {
|
||||||
@ -205,6 +213,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||||
[ctx->buffers[i].metal release];
|
[ctx->buffers[i].metal release];
|
||||||
}
|
}
|
||||||
|
ctx->command_dispatch_queue = nil;
|
||||||
|
[ctx->buffer_allocation_condition release];
|
||||||
|
ctx->buffer_allocation_condition = nil;
|
||||||
|
ctx->n_buffers = 0;
|
||||||
|
ctx->device = nil;
|
||||||
|
ctx->library = nil;
|
||||||
|
ctx->queue = nil;
|
||||||
free(ctx);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,7 +227,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
+static int ggml_metal_get_buffer_index(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
@@ -220,19 +235,82 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     // find the view that contains the tensor fully
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
+        size_t size = ctx->buffers[i].size;
 
-        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) size) {
             *offs = (size_t) ioffs;
-            //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
-
-            return ctx->buffers[i].metal;
+            return i;
         }
     }
 
-    fprintf(stderr, "%s: error: buffer is nil\n", __func__);
+    fprintf(stderr, "%s: error: buffer is NULL\n", __func__);
+    return -1;
+}
+
+static void ggml_metal_release_unused_buffer(struct ggml_metal_context * ctx) {
+    // Release a buffer that no command buffers are actively using
+    if (ctx->allocation_waiting) {
+        for (int i = 0; i < ctx->n_buffers; ++i) {
+            if (ctx->buffers[i].gpu_use_count == 0 && ctx->buffers[i].metal != nil) {
+                [ctx->buffers[i].metal release];
+                ctx->buffers[i].metal = nil;
+                ctx->allocation_waiting = false;
+                break;
+            }
+        }
+    }
+}
+
+static void ggml_metal_signal_buffer_dealloc(struct ggml_metal_context * ctx, int buf_idx) {
+    void *data = ctx->buffers[buf_idx].data;
+    const char * name = ctx->buffers[buf_idx].name;
+    size_t address = (size_t) data;
+
+    [ctx->buffer_allocation_condition lock];
+    [ctx->buffer_allocation_condition signal];
+    [ctx->buffer_allocation_condition unlock];
+}
+
+static void ggml_metal_allocate_buffer(struct ggml_metal_context * ctx, int buf_idx) {
+    void *data = ctx->buffers[buf_idx].data;
+    const char * name = ctx->buffers[buf_idx].name;
+    size_t size = ctx->buffers[buf_idx].size;
+    size_t address = (size_t) data;
+
+    ctx->buffers[buf_idx].metal = [ctx->device newBufferWithBytesNoCopy:data
+                                       length:size
+                                       options:MTLResourceStorageModeShared
+                                       deallocator:^(void *const ptr, const NSUInteger len) {
+        dispatch_async(ctx->command_dispatch_queue, ^{
+            ggml_metal_signal_buffer_dealloc(ctx, buf_idx);
+        });
+    }];
+}
+
+static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, int buf_idx, bool wait_for_alloc) {
+    if (buf_idx < 0) {
         return nil;
+    }
+    size_t size = ctx->buffers[buf_idx].size;
+
+    while (true) {
+        if (ctx->buffers[buf_idx].metal != nil) {
+            return ctx->buffers[buf_idx].metal;
+        }
+        if (ctx->device.currentAllocatedSize + size <= ctx->device.recommendedMaxWorkingSetSize) {
+            ggml_metal_allocate_buffer(ctx, buf_idx);
+            return ctx->buffers[buf_idx].metal;
+        }
+        if (!wait_for_alloc) {
+            return nil;
+        }
+
+        ctx->allocation_waiting = true;
+        ggml_metal_release_unused_buffer(ctx);
+        [ctx->buffer_allocation_condition wait];
+    }
+
+    return ctx->buffers[buf_idx].metal;
 }
 
 bool ggml_metal_add_buffer(
@@ -268,24 +346,18 @@ bool ggml_metal_add_buffer(
         if (size_aligned <= ctx->device.maxBufferLength) {
             ctx->buffers[ctx->n_buffers].name = name;
             ctx->buffers[ctx->n_buffers].data = data;
-            ctx->buffers[ctx->n_buffers].size = size;
-            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
-                return false;
-            }
-
-            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
-
+            ctx->buffers[ctx->n_buffers].size = size_aligned;
+            ctx->buffers[ctx->n_buffers].metal = nil;
+            ctx->buffers[ctx->n_buffers].gpu_use_count = 0;
+            fprintf(stderr, "%s: prepared size for '%-16s' buffer (%d), size = %8.2f MB\n", __func__, name, ctx->n_buffers, size_aligned / 1024.0 / 1024.0);
             ++ctx->n_buffers;
         } else {
             // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
             // one of the views
+            size_t max_buffer_length = 1 * 1024ul * 1024ul * 1024ul; // ctx->device.maxBufferLength;
             const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-            const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
-            const size_t size_view = ctx->device.maxBufferLength;
+            const size_t size_step = max_buffer_length - size_ovlp;
+            const size_t size_view = max_buffer_length;
 
             for (size_t i = 0; i < size; i += size_step) {
                 const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
@@ -293,59 +365,18 @@ bool ggml_metal_add_buffer(
                 ctx->buffers[ctx->n_buffers].name = name;
                 ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
                 ctx->buffers[ctx->n_buffers].size = size_step_aligned;
-                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
-                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
-                    return false;
-                }
-
-                fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
-                if (i + size_step < size) {
-                    fprintf(stderr, "\n");
-                }
-
+                ctx->buffers[ctx->n_buffers].metal = nil;
+                ctx->buffers[ctx->n_buffers].gpu_use_count = 0;
+                fprintf(stderr, "%s: prepared size for '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                 ++ctx->n_buffers;
             }
         }
 
-        fprintf(stderr, ", (%8.2f / %8.2f)",
-            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
-            fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
-        } else {
-            fprintf(stderr, "\n");
-        }
     }
 
     return true;
 }
 
-void ggml_metal_set_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
-
-    size_t offs;
-    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
-}
-
-void ggml_metal_get_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
-
-    size_t offs;
-    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
-}
-
 void ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
@@ -354,38 +385,82 @@ void ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_cb = gf->n_threads;
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:16];
 
-    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+    int buf_idx_src0 = -1;
+    int buf_idx_src1 = -1;
+    int buf_idx_dst = -1;
 
-    for (int i = 0; i < n_cb; ++i) {
-        command_buffers[i] = [ctx->queue commandBuffer];
+    int node_curr = 0;
+    while (node_curr < gf->n_nodes) {
+        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
+        [command_buffers addObject:command_buffer];
+        [command_buffer enqueue];
 
-        // enqueue the command buffers in order to specify their execution order
-        [command_buffers[i] enqueue];
-    }
-
-    // TODO: is this the best way to start threads?
-    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
-
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
-
-        dispatch_async(queue, ^{
+        [ctx->buffer_allocation_condition lock];
+        NSMutableSet *buf_idxs_used = [NSMutableSet set];
+        int node_start = node_curr;
+        while (node_curr < gf->n_nodes) {
             size_t offs_src0 = 0;
             size_t offs_src1 = 0;
             size_t offs_dst = 0;
 
-            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
+            struct ggml_tensor * src0 = gf->nodes[node_curr]->src0;
+            struct ggml_tensor * src1 = gf->nodes[node_curr]->src1;
+            struct ggml_tensor * dst = gf->nodes[node_curr];
+
+            bool wait_for_alloc = (node_start == node_curr);
+
+#define GGML_METAL_ALLOC_OR_BREAK(buffer_name) \
+            if (buffer_name) { \
+                if (buf_idx_##buffer_name < 0) { \
+                    int buf_idx = ggml_metal_get_buffer_index(ctx, buffer_name, &offs_##buffer_name); \
+                    if (ggml_metal_get_buffer(ctx, buf_idx, wait_for_alloc)) { \
+                        buf_idx_##buffer_name = buf_idx; \
+                        [buf_idxs_used addObject:[NSNumber numberWithInt:buf_idx]]; \
+                    } else { \
+                        break; \
+                    } \
+                } \
+            }
+
+            GGML_METAL_ALLOC_OR_BREAK(src0)
+            GGML_METAL_ALLOC_OR_BREAK(src1)
+            GGML_METAL_ALLOC_OR_BREAK(dst)
+
+#undef GGML_METAL_ALLOC_OR_BREAK
+
+            buf_idx_src0 = -1;
+            buf_idx_src1 = -1;
+            buf_idx_dst = -1;
+            ++node_curr;
+        }
+
+        for (NSNumber *buf_idx_num in buf_idxs_used) {
+            int buf_idx = [buf_idx_num intValue];
+            ctx->buffers[buf_idx].gpu_use_count++;
+        }
+        [ctx->buffer_allocation_condition unlock];
+
+        [command_buffer addCompletedHandler: ^(id<MTLCommandBuffer> cb) {
+            [ctx->buffer_allocation_condition lock];
+            for (NSNumber *buf_idx_num in buf_idxs_used) {
+                int buf_idx = [buf_idx_num intValue];
+                ctx->buffers[buf_idx].gpu_use_count--;
+            }
+            ggml_metal_release_unused_buffer(ctx);
+            [ctx->buffer_allocation_condition unlock];
+        }];
 
         id<MTLComputeCommandEncoder> encoder = nil;
 
-            const int node_start = (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
-
-            for (int i = node_start; i < node_end; ++i) {
+        for (int i = node_start; i < node_curr; ++i) {
             metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
+            size_t offs_src0 = 0;
+            size_t offs_src1 = 0;
+            size_t offs_dst = 0;
+
             struct ggml_tensor * src0 = gf->nodes[i]->src0;
             struct ggml_tensor * src1 = gf->nodes[i]->src1;
             struct ggml_tensor * dst = gf->nodes[i];
@@ -424,9 +499,11 @@ void ggml_metal_graph_compute(
             const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
             const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
 
-            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-            id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
+            [ctx->buffer_allocation_condition lock];
+            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, src0, &offs_src0), false) : nil;
+            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, src1, &offs_src1), false) : nil;
+            id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, ggml_metal_get_buffer_index(ctx, dst, &offs_dst), false) : nil;
+            [ctx->buffer_allocation_condition unlock];
 
             //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
             //if (src0) {
@@ -960,12 +1037,9 @@ void ggml_metal_graph_compute(
         }
 
         [command_buffer commit];
-        });
     }
 
-    // wait for all threads to finish
-    dispatch_barrier_sync(queue, ^{});
-
+    int n_cb = [command_buffers count];
     [command_buffers[n_cb - 1] waitUntilCompleted];
 
     // check status of command buffers
@@ -973,7 +1047,8 @@ void ggml_metal_graph_compute(
     for (int i = 0; i < n_cb; i++) {
        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
        if (status != MTLCommandBufferStatusCompleted) {
-           fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+           const char *error_str = [[[command_buffers[i] error] description] cStringUsingEncoding:NSUTF8StringEncoding];
+           fprintf(stderr, "%s: command buffer %d failed with status %lu: %s\n", __func__, i, status, error_str);
            GGML_ASSERT(false);
        }
    }
@@ -1555,7 +1555,6 @@ static bool llama_eval_internal(
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
-        ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1564,14 +1563,6 @@ static bool llama_eval_internal(
         //
         // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
         // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
 
         ggml_graph_compute(ctx0, &gf);
     }