mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-31 22:04:35 +00:00
metal : add comments
This commit is contained in:
parent
43b9d694df
commit
5273e59b09
@ -15,7 +15,8 @@
|
|||||||
// max memory buffers that can be mapped to the device
|
// max memory buffers that can be mapped to the device
|
||||||
#define GGML_METAL_MAX_BUFFERS 64
|
#define GGML_METAL_MAX_BUFFERS 64
|
||||||
|
|
||||||
#define GGML_METAL_MAX_COMMAND_BUFFERS 128
|
// max number of MTLCommandBuffer used to submit a graph for processing
|
||||||
|
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
|
||||||
|
|
||||||
#ifdef GGML_METAL_NDEBUG
|
#ifdef GGML_METAL_NDEBUG
|
||||||
#define GGML_METAL_LOG(...)
|
#define GGML_METAL_LOG(...)
|
||||||
@ -226,8 +227,6 @@ enum ggml_metal_kernel_type {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_metal_context {
|
struct ggml_backend_metal_context {
|
||||||
int n_cb;
|
|
||||||
|
|
||||||
id<MTLDevice> device;
|
id<MTLDevice> device;
|
||||||
id<MTLCommandQueue> queue;
|
id<MTLCommandQueue> queue;
|
||||||
|
|
||||||
@ -240,21 +239,28 @@ struct ggml_backend_metal_context {
|
|||||||
bool support_simdgroup_reduction;
|
bool support_simdgroup_reduction;
|
||||||
bool support_simdgroup_mm;
|
bool support_simdgroup_mm;
|
||||||
|
|
||||||
bool should_capture_next_compute;
|
// capture state
|
||||||
|
bool capture_next_compute;
|
||||||
bool capture_started;
|
bool capture_started;
|
||||||
|
|
||||||
id<MTLCaptureScope> cap_scope;
|
id<MTLCaptureScope> capture_scope;
|
||||||
|
|
||||||
id<MTLCommandBuffer> command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
|
// command buffer state
|
||||||
|
int n_cb; // number of extra threads used to submit the command buffers
|
||||||
int n_nodes_0;
|
int n_nodes_0; // number of nodes submitted by the main thread
|
||||||
int n_nodes_1;
|
int n_nodes_1; // remaining number of nodes submitted by the n_cb threads
|
||||||
int n_nodes_per_cb;
|
int n_nodes_per_cb;
|
||||||
|
|
||||||
struct ggml_cgraph * gf;
|
struct ggml_cgraph * gf;
|
||||||
|
|
||||||
|
// the callback given to the thread pool
|
||||||
|
// TODO: ideally, this should be created once, utilizing the command buffer state above
|
||||||
|
// for some reason, doing it like this leads to a crash
|
||||||
void (^encode_async)(size_t ith);
|
void (^encode_async)(size_t ith);
|
||||||
|
|
||||||
|
// n_cb command buffers + 1 used by the main thread
|
||||||
|
id<MTLCommandBuffer> command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
|
||||||
|
|
||||||
// abort ggml_metal_graph_compute if callback returns true
|
// abort ggml_metal_graph_compute if callback returns true
|
||||||
ggml_abort_callback abort_callback;
|
ggml_abort_callback abort_callback;
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
@ -476,17 +482,16 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
|
|||||||
GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
|
GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
|
||||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||||
|
|
||||||
ctx->should_capture_next_compute = false;
|
ctx->capture_next_compute = false;
|
||||||
ctx->capture_started = false;
|
ctx->capture_started = false;
|
||||||
|
ctx->capture_scope = nil;
|
||||||
|
|
||||||
ctx->cap_scope = nil;
|
ctx->gf = nil;
|
||||||
|
ctx->encode_async = nil;
|
||||||
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
||||||
ctx->command_buffers[i] = nil;
|
ctx->command_buffers[i] = nil;
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->encode_async = nil;
|
|
||||||
|
|
||||||
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
||||||
@ -3000,31 +3005,37 @@ static void ggml_metal_encode_node(
|
|||||||
static enum ggml_status ggml_metal_graph_compute(
|
static enum ggml_status ggml_metal_graph_compute(
|
||||||
struct ggml_backend_metal_context * ctx,
|
struct ggml_backend_metal_context * ctx,
|
||||||
struct ggml_cgraph * gf) {
|
struct ggml_cgraph * gf) {
|
||||||
|
// number of nodes encoded by the main thread (empirically determined)
|
||||||
|
const int n_main = 128;
|
||||||
|
|
||||||
|
// number of threads in addition to the main thread
|
||||||
|
const int n_cb = ctx->n_cb;
|
||||||
|
|
||||||
|
// submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
|
||||||
|
// the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
|
||||||
|
// while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
|
||||||
|
// each thread creates its own command buffer and enqueues the ops in parallel
|
||||||
|
//
|
||||||
|
// tests on M1 Pro and M2 Ultra using LLaMA models show that the optimal values for n_cb are 1 or 2
|
||||||
|
|
||||||
@autoreleasepool {
|
@autoreleasepool {
|
||||||
// create multiple command buffers and enqueue them
|
|
||||||
// then, we encode the graph into the command buffers in parallel
|
|
||||||
|
|
||||||
const int n_cb = ctx->n_cb;
|
|
||||||
|
|
||||||
ctx->gf = gf;
|
ctx->gf = gf;
|
||||||
|
|
||||||
ctx->n_nodes_0 = MIN(128, gf->n_nodes);
|
ctx->n_nodes_0 = MIN(n_main, gf->n_nodes);
|
||||||
ctx->n_nodes_1 = gf->n_nodes - ctx->n_nodes_0;
|
ctx->n_nodes_1 = gf->n_nodes - ctx->n_nodes_0;
|
||||||
|
|
||||||
ctx->n_nodes_per_cb = (ctx->n_nodes_1 + n_cb - 1) / n_cb;
|
ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb;
|
||||||
|
|
||||||
//const int64_t t_start = ggml_time_us();
|
const bool should_capture = ctx->capture_next_compute;
|
||||||
|
|
||||||
const bool should_capture = ctx->should_capture_next_compute;
|
|
||||||
if (should_capture) {
|
if (should_capture) {
|
||||||
ctx->should_capture_next_compute = false;
|
ctx->capture_next_compute = false;
|
||||||
|
|
||||||
if (!ctx->capture_started) {
|
if (!ctx->capture_started) {
|
||||||
// create capture scope
|
// create capture scope
|
||||||
ctx->cap_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
|
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
|
||||||
|
|
||||||
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
|
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
|
||||||
descriptor.captureObject = ctx->cap_scope;
|
descriptor.captureObject = ctx->capture_scope;
|
||||||
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
|
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
|
||||||
descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
|
descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
|
||||||
|
|
||||||
@ -3033,7 +3044,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
|
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
|
||||||
GGML_ABORT("capture failed");
|
GGML_ABORT("capture failed");
|
||||||
} else {
|
} else {
|
||||||
[ctx->cap_scope beginScope];
|
[ctx->capture_scope beginScope];
|
||||||
ctx->capture_started = true;
|
ctx->capture_started = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -3055,7 +3066,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
int node_start = 0;
|
int node_start = 0;
|
||||||
int node_end = n_nodes_0;
|
int node_end = n_nodes_0;
|
||||||
|
|
||||||
if ((int) iter < n_cb_l) {
|
if (cb_idx < n_cb_l) {
|
||||||
node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb);
|
node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb);
|
||||||
node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
|
node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
|
||||||
}
|
}
|
||||||
@ -3079,17 +3090,20 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// the main thread commits the first few commands immediately
|
||||||
|
// command_buffers[n_cb]
|
||||||
{
|
{
|
||||||
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
||||||
ctx->command_buffers[n_cb] = command_buffer;
|
ctx->command_buffers[n_cb] = command_buffer;
|
||||||
|
|
||||||
[command_buffer enqueue];
|
[command_buffer enqueue];
|
||||||
|
|
||||||
ctx->encode_async(n_cb);
|
ctx->encode_async(n_cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// prepare the rest of the command buffers asynchronously
|
||||||
|
// command_buffers[0 .. n_cb)
|
||||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||||
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
||||||
ctx->command_buffers[cb_idx] = command_buffer;
|
ctx->command_buffers[cb_idx] = command_buffer;
|
||||||
|
|
||||||
// always enqueue the first two command buffers
|
// always enqueue the first two command buffers
|
||||||
@ -3101,14 +3115,8 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
|
|
||||||
dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
|
dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
|
||||||
|
|
||||||
//{
|
// wait for completion and check status of each command buffer
|
||||||
// const int64_t t_end = ggml_time_us();
|
|
||||||
// //printf("time to encode: %d us, n_cb = %d\n", (int) (t_end - t_start), n_cb);
|
|
||||||
//}
|
|
||||||
|
|
||||||
// Wait for completion and check status of each command buffer
|
|
||||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||||
|
|
||||||
{
|
{
|
||||||
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[n_cb];
|
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[n_cb];
|
||||||
[command_buffer waitUntilCompleted];
|
[command_buffer waitUntilCompleted];
|
||||||
@ -3143,7 +3151,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
|
const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
|
||||||
if (next_queued) {
|
if (next_queued) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -3156,13 +3164,8 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
[next_buffer commit];
|
[next_buffer commit];
|
||||||
}
|
}
|
||||||
|
|
||||||
//{
|
|
||||||
// const int64_t t_end = ggml_time_us();
|
|
||||||
// printf("time to compute: %d us\n", (int)(t_end - t_start));
|
|
||||||
//}
|
|
||||||
|
|
||||||
if (!should_capture && ctx->capture_started) {
|
if (!should_capture && ctx->capture_started) {
|
||||||
[ctx->cap_scope endScope];
|
[ctx->capture_scope endScope];
|
||||||
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -3514,7 +3517,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: setting encode_async here causes crash. why?
|
// TODO: setting encode_async here causes crash during the next ggml_metal_graph_compute call. why?
|
||||||
//ctx->encode_async = ^(size_t iter) {
|
//ctx->encode_async = ^(size_t iter) {
|
||||||
// ...
|
// ...
|
||||||
//};
|
//};
|
||||||
@ -3598,7 +3601,7 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
|||||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
ctx->should_capture_next_compute = true;
|
ctx->capture_next_compute = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
||||||
|
Loading…
Reference in New Issue
Block a user