From 3d679827e7d21a5a970582bc09afcc575871e0b5 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 21 Jul 2023 12:41:46 +0200
Subject: [PATCH] improved memory management fixes

---
 ggml-backend.c | 92 ++++++++++++++++++++++++++++++++++++--------------
 ggml.c         |  1 +
 ggml.h         |  3 +-
 llama.cpp      |  4 ++-
 4 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index f19454a15..a8fc3632b 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -7,6 +7,9 @@
 
 #define UNUSED(x) (void)(x)
 
+//#define AT_PRINTF printf
+#define AT_PRINTF(...) ((void)0)
+
 // allocator
 
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@@ -146,16 +149,16 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
     /////
     if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
         allocator_ctx->size = MAX_SIZE_INIT;
-        //allocator_ctx->data = 0;
+        allocator_ctx->data = 0x1000;
         allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
-        //allocator_ctx->free_blocks[0].addr = 0;
+        allocator_ctx->free_blocks[0].addr = 0x1000;
     }
     /////
 
     size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, allocator_ctx->alignment);
 
-    // printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
     size_t max_avail = 0;
 
@@ -173,7 +176,7 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
         }
     }
 
-    // printf("block %d\n", best_fit_block);
+    AT_PRINTF("block %d\n", best_fit_block);
 
     if (best_fit_block == -1) {
         fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
@@ -217,7 +220,8 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
 
     size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, allocator_ctx->alignment);
-    //printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
+    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
+    tensor->freed = true;
 
     // see if we can merge with an existing block
     for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
@@ -826,11 +830,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
             struct ggml_tensor * node = gf->nodes[i];
             node->n_children = 0;
             node->n_views = 0;
+            //node->freed = false;
         }
         for (int i = 0; i < gf->n_leafs; i++) {
            struct ggml_tensor * leaf = gf->leafs[i];
             leaf->n_children = 0;
             leaf->n_views = 0;
+            //leaf->freed = false;
         }
     }
 
@@ -839,6 +845,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
         struct ggml_cgraph * gf = graphs[g];
         for (int i = 0; i < gf->n_nodes; i++) {
             struct ggml_tensor * node = gf->nodes[i];
+            if (ggml_is_view(node)) {
+                struct ggml_tensor * ancestor = node;
+                do {
+                    ancestor = view_parent(ancestor);
+                } while (ggml_is_view(ancestor));
+                ancestor->n_views += 1;
+            }
             for (int j = 0; j < GGML_MAX_SRC; j++) {
                 struct ggml_tensor * parent = node->src[j];
                 if (parent == NULL) {
@@ -869,47 +882,74 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
                 if (parent == NULL) {
                     break;
                 }
+                if (parent->freed) {
+                    printf("!!!!!! tensor %s used after free\n", parent->name);
+                }
+                if (ggml_is_view(parent)) {
+                    struct ggml_tensor * ancestor = parent;
+                    do {
+                        ancestor = view_parent(ancestor);
+                    } while (ggml_is_view(ancestor));
+                    if (ancestor->freed) {
+                        printf("!!!!!! tensor %s used after free (as view %s)\n", ancestor->name, parent->name);
+                    }
+                    allocate_node(buffer, ancestor);
+                }
                 allocate_node(buffer, parent);
             }
 
             // allocate node
             allocate_node(buffer, node);
 
-            // update parents
-            if (is_view) {
-                struct ggml_tensor * ancestor = node;
-                do {
-                    ancestor = view_parent(ancestor);
-                } while (ggml_is_view(ancestor));
-                ancestor->n_views -= 1;
-                if (ancestor->n_views == 0) {
-                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
+            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
                 }
-            } else {
-                for (int j = 0; j < GGML_MAX_SRC; j++) {
-                    struct ggml_tensor * parent = node->src[j];
-                    if (parent == NULL) {
-                        break;
-                    }
+                AT_PRINTF("%s", parent->name);
+                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                    AT_PRINTF(", ");
+                }
+            }
+            AT_PRINTF("\n");
+
+            // update parents
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                parent->n_children -= 1;
+                if (parent->n_children == 0 && parent->n_views == 0) {
                     if (ggml_is_view(parent)) {
                         struct ggml_tensor * ancestor = parent;
                         do {
                             ancestor = view_parent(ancestor);
                         } while (ggml_is_view(ancestor));
                         ancestor->n_views -= 1;
-                        if (ancestor->n_views == 0) {
+                        if (ancestor->n_views == 0 && ancestor->n_children == 0) {
                             ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
                         }
                     } else {
-                        parent->n_children -= 1;
-                        if (parent->n_children == 0) {
-                            // free parent
-                            ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
-                        }
+                        ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
                     }
                 }
             }
+
+            if (is_view) {
+                struct ggml_tensor * ancestor = node;
+                do {
+                    ancestor = view_parent(ancestor);
+                } while (ggml_is_view(ancestor));
+                ancestor->n_views -= 1;
+                if (ancestor->n_views == 0 && ancestor->n_children == 0) {
+                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
+                }
+            }
+
+            AT_PRINTF("\n");
 
         }
     }
 }
diff --git a/ggml.c b/ggml.c
index 8cf96383a..cff0b2f59 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4533,6 +4533,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.node_id      =*/ -1,
         /*.n_children   =*/ 0,
         /*.n_views      =*/ 0,
+        /*.freed        =*/ false,
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
diff --git a/ggml.h b/ggml.h
index c93b7d1d4..50e81828a 100644
--- a/ggml.h
+++ b/ggml.h
@@ -425,6 +425,7 @@ extern "C" {
         int node_id; // used to build graphs
         int n_children;
         int n_views;
+        bool freed; // debug
 
         // performance
         int perf_runs;
@@ -437,7 +438,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[12];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
diff --git a/llama.cpp b/llama.cpp
index e16ba7ac8..1eefb8641 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -703,7 +703,9 @@ static bool kv_cache_init(
     const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
-    size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB;
+    size_t size = 2u*n_elements*ggml_type_size(wtype);
+
+    fprintf(stderr, "%s: allocating %.2f MB for kv cache\n", __func__, size / 1024.0 / 1024.0);
 
     cache.buf = ggml_buffer_alloc(backend, size, 2);
     cache.n = 0;