improved memory management fixes

2024-12-27 03:44:35 +00:00 · 2023-07-21 12:41:46 +02:00 · 2023-07-21 12:41:46 +02:00 · 3d679827e7
commit 3d679827e7
parent 56e9ae062c
4 changed files with 72 additions and 28 deletions
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -7,6 +7,9 @@

 #define UNUSED(x) (void)(x)

+//#define AT_PRINTF printf
+#define AT_PRINTF(...) ((void)0)
+
 // allocator

 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@ -146,16 +149,16 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
    /////
    if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
        allocator_ctx->size = MAX_SIZE_INIT;
-        //allocator_ctx->data = 0;
+        allocator_ctx->data = 0x1000;
        allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
-        //allocator_ctx->free_blocks[0].addr = 0;
+        allocator_ctx->free_blocks[0].addr = 0x1000;
    }
    /////

    size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, allocator_ctx->alignment);

-    // printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

    size_t max_avail = 0;

@ -173,7 +176,7 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
        }
    }

-    // printf("block %d\n", best_fit_block);
+    AT_PRINTF("block %d\n", best_fit_block);

    if (best_fit_block == -1) {
        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
@ -217,7 +220,8 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru

    size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, allocator_ctx->alignment);
-    //printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
+    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
+    tensor->freed = true;

    // see if we can merge with an existing block
    for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
@ -826,11 +830,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
            struct ggml_tensor * node = gf->nodes[i];
            node->n_children = 0;
            node->n_views = 0;
+            //node->freed = false;
        }
        for (int i = 0; i < gf->n_leafs; i++) {
            struct ggml_tensor * leaf = gf->leafs[i];
            leaf->n_children = 0;
            leaf->n_views = 0;
+            //leaf->freed = false;
        }
    }

@ -839,6 +845,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
        struct ggml_cgraph * gf = graphs[g];
        for (int i = 0; i < gf->n_nodes; i++) {
            struct ggml_tensor * node = gf->nodes[i];
+            if (ggml_is_view(node)) {
+                struct ggml_tensor * ancestor = node;
+                do {
+                    ancestor = view_parent(ancestor);
+                } while (ggml_is_view(ancestor));
+                ancestor->n_views += 1;
+            }
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
@ -869,47 +882,74 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
                if (parent == NULL) {
                    break;
                }
+                if (parent->freed) {
+                    printf("!!!!!! tensor %s used after free\n", parent->name);
+                }
+                if (ggml_is_view(parent)) {
+                    struct ggml_tensor * ancestor = parent;
+                    do {
+                        ancestor = view_parent(ancestor);
+                    } while (ggml_is_view(ancestor));
+                    if (ancestor->freed) {
+                        printf("!!!!!! tensor %s used after free (as view %s)\n", ancestor->name, parent->name);
+                    }
+                    allocate_node(buffer, ancestor);
+                }
                allocate_node(buffer, parent);
            }

            // allocate node
            allocate_node(buffer, node);

-            // update parents
-            if (is_view) {
-                struct ggml_tensor * ancestor = node;
-                do {
-                    ancestor = view_parent(ancestor);
-                } while (ggml_is_view(ancestor));
-                ancestor->n_views -= 1;
-                if (ancestor->n_views == 0) {
-                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
+            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
                }
-            } else {
-                for (int j = 0; j < GGML_MAX_SRC; j++) {
-                    struct ggml_tensor * parent = node->src[j];
-                    if (parent == NULL) {
-                        break;
-                    }
+                AT_PRINTF("%s", parent->name);
+                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                    AT_PRINTF(", ");
+                }
+            }
+            AT_PRINTF("\n");
+
+            // update parents
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                parent->n_children -= 1;
+                if (parent->n_children == 0 && parent->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * ancestor = parent;
                        do {
                            ancestor = view_parent(ancestor);
                        } while (ggml_is_view(ancestor));
                        ancestor->n_views -= 1;
-                        if (ancestor->n_views == 0) {
+                        if (ancestor->n_views == 0 && ancestor->n_children == 0) {
                            ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
                        }
                    }
                    else {
-                        parent->n_children -= 1;
-                        if (parent->n_children == 0) {
-                            // free parent
-                            ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
-                        }
+                        ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
                    }
                }
            }
+
+            if (is_view) {
+                struct ggml_tensor * ancestor = node;
+                do {
+                    ancestor = view_parent(ancestor);
+                } while (ggml_is_view(ancestor));
+                ancestor->n_views -= 1;
+                if (ancestor->n_views == 0 && ancestor->n_children == 0) {
+                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
+                }
+            }
+
+            AT_PRINTF("\n");
        }
    }
 }
--- a/ggml.c
+++ b/ggml.c
@ -4533,6 +4533,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
        /*.node_id      =*/ -1,
        /*.n_children   =*/ 0,
        /*.n_views      =*/ 0,
+        /*.freed        =*/ false,
        /*.perf_runs    =*/ 0,
        /*.perf_cycles  =*/ 0,
        /*.perf_time_us =*/ 0,
--- a/ggml.h
+++ b/ggml.h
@ -425,6 +425,7 @@ extern "C" {
        int node_id; // used to build graphs
        int n_children;
        int n_views;
+        bool freed; // debug

        // performance
        int     perf_runs;
@ -437,7 +438,7 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[12];
+        char padding[8];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
--- a/llama.cpp
+++ b/llama.cpp
@ -703,7 +703,9 @@ static bool kv_cache_init(
    const int64_t n_mem      = n_layer*n_ctx;
    const int64_t n_elements = n_embd*n_mem;

-    size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB;
+    size_t size = 2u*n_elements*ggml_type_size(wtype);
+
+    fprintf(stderr, "%s: allocating %.2f MB for kv cache\n", __func__, size / 1024.0 / 1024.0);

    cache.buf = ggml_buffer_alloc(backend, size, 2);
    cache.n = 0;