Adjust Metal buffer allocation to avoid allocating beyond MTLDevice.recommendedMaxWorkingSetSize

2025-01-11 03:01:45 +00:00 · 2023-07-01 21:33:16 -07:00 · 2023-07-01 21:33:16 -07:00 · da7d2f9587
commit da7d2f9587
parent b213227067
4 changed files with 691 additions and 637 deletions
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
        *(int32_t *) input->data = 1; // BOS

-        ggml_metal_set_tensor(ctx_metal, input);
-
        // warmup
        ggml_metal_graph_compute(ctx_metal, &gf);

@@ -72,7 +70,6 @@ int main(int argc, char ** argv) {
    // debug output
    {
        struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
-        ggml_metal_get_tensor(ctx_metal, logits);

        float * ptr = (float *) ggml_get_data(logits);

--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -13,9 +13,6 @@
 // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
 // used during the graph evaluation to determine the arguments of the compute kernels.
 //
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//

 #pragma once

@@ -23,7 +20,7 @@
 #include <stdbool.h>

 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_BUFFERS 256

 struct ggml_tensor;
 struct ggml_cgraph;
@@ -51,12 +48,6 @@ bool ggml_metal_add_buffer(
                           size_t   size,
                           size_t   max_size);

-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
--- a/ggml-metal.m
+++ b/ggml-metal.m
--- a/llama.cpp
+++ b/llama.cpp
@@ -1555,7 +1555,6 @@ static bool llama_eval_internal(
 #ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
    } else {
        // IMPORTANT:
        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1564,15 +1563,7 @@ static bool llama_eval_internal(
        //
        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
        // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
-
+        
        ggml_graph_compute(ctx0, &gf);
    }
 #else