Adjust Metal buffer allocation to avoid allocating beyond MTLDevice.recommendedMaxWorkingSetSize

This commit is contained in:
Kilty McGowan 2023-07-01 21:33:16 -07:00
parent b213227067
commit da7d2f9587
4 changed files with 691 additions and 637 deletions

View File

@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd"); struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
*(int32_t *) input->data = 1; // BOS *(int32_t *) input->data = 1; // BOS
ggml_metal_set_tensor(ctx_metal, input);
// warmup // warmup
ggml_metal_graph_compute(ctx_metal, &gf); ggml_metal_graph_compute(ctx_metal, &gf);
@@ -72,7 +70,6 @@ int main(int argc, char ** argv) {
// debug output // debug output
{ {
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1]; struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
ggml_metal_get_tensor(ctx_metal, logits);
float * ptr = (float *) ggml_get_data(logits); float * ptr = (float *) ggml_get_data(logits);

View File

@@ -13,9 +13,6 @@
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels. // used during the graph evaluation to determine the arguments of the compute kernels.
// //
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//
#pragma once #pragma once
@@ -23,7 +20,7 @@
#include <stdbool.h> #include <stdbool.h>
// max memory buffers that can be mapped to the device // max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16 #define GGML_METAL_MAX_BUFFERS 256
struct ggml_tensor; struct ggml_tensor;
struct ggml_cgraph; struct ggml_cgraph;
@@ -51,12 +48,6 @@ bool ggml_metal_add_buffer(
size_t size, size_t size,
size_t max_size); size_t max_size);
// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// same as ggml_graph_compute but uses Metal // same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel // creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

File diff suppressed because it is too large Load Diff

View File

@@ -1555,7 +1555,6 @@ static bool llama_eval_internal(
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) { if (lctx.ctx_metal && N == 1) {
ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_graph_compute(lctx.ctx_metal, &gf);
ggml_metal_get_tensor (lctx.ctx_metal, cur);
} else { } else {
// IMPORTANT: // IMPORTANT:
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1564,14 +1563,6 @@ static bool llama_eval_internal(
// //
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch. // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
// But for now, we have focused only on Matrix x Vector Metal multiplication. // But for now, we have focused only on Matrix x Vector Metal multiplication.
//
// TODO: avoid these syncs via shared memory (ref #1696)
//
if (lctx.ctx_metal) {
// We need to sync the GPU KV cache with the CPU KV cache
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
ggml_graph_compute(ctx0, &gf); ggml_graph_compute(ctx0, &gf);
} }