From 3d679827e7d21a5a970582bc09afcc575871e0b5 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 21 Jul 2023 12:41:46 +0200
Subject: [PATCH] improved memory management fixes

---
 ggml-backend.c | 92 ++++++++++++++++++++++++++++++++++++--------------
 ggml.c         |  1 +
 ggml.h         |  3 +-
 llama.cpp      |  4 ++-
 4 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index f19454a15..a8fc3632b 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -7,6 +7,9 @@
 
 #define UNUSED(x) (void)(x)
 
+//#define AT_PRINTF printf
+#define AT_PRINTF(...) ((void)0)
+
 // allocator
 
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@@ -146,16 +149,16 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
     /////
     if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
         allocator_ctx->size = MAX_SIZE_INIT;
-        //allocator_ctx->data = 0;
+        allocator_ctx->data = 0x1000;
         allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
-        //allocator_ctx->free_blocks[0].addr = 0;
+        allocator_ctx->free_blocks[0].addr = 0x1000;
     }
     /////
 
     size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, allocator_ctx->alignment);
 
-    // printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
     size_t max_avail = 0;
 
@@ -173,7 +176,7 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
         }
     }
 
-    // printf("block %d\n", best_fit_block);
+    AT_PRINTF("block %d\n", best_fit_block);
 
     if (best_fit_block == -1) {
         fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
@@ -217,7 +220,8 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
 
     size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, allocator_ctx->alignment);
-    //printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
+    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
+    tensor->freed = true;
 
     // see if we can merge with an existing block
     for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
@@ -826,11 +830,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
             struct ggml_tensor * node = gf->nodes[i];
             node->n_children = 0;
             node->n_views = 0;
+            //node->freed = false;
         }
         for (int i = 0; i < gf->n_leafs; i++) {
            struct ggml_tensor * leaf = gf->leafs[i];
             leaf->n_children = 0;
             leaf->n_views = 0;
+            //leaf->freed = false;
         }
     }
 
@@ -839,6 +845,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
         struct ggml_cgraph * gf = graphs[g];
         for (int i = 0; i < gf->n_nodes; i++) {
             struct ggml_tensor * node = gf->nodes[i];
+            if (ggml_is_view(node)) {
+                struct ggml_tensor * ancestor = node;
+                do {
+                    ancestor = view_parent(ancestor);
+                } while (ggml_is_view(ancestor));
+                ancestor->n_views += 1;
+            }
             for (int j = 0; j < GGML_MAX_SRC; j++) {
                 struct ggml_tensor * parent = node->src[j];
                 if (parent == NULL) {
@@ -869,47 +882,74 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
                 if (parent == NULL) {
                     break;
                 }
+                if (parent->freed) {
+                    printf("!!!!!! tensor %s used after free\n", parent->name);
+                }
+                if (ggml_is_view(parent)) {
+                    struct ggml_tensor * ancestor = parent;
+                    do {
+                        ancestor = view_parent(ancestor);
+                    } while (ggml_is_view(ancestor));
+                    if (ancestor->freed) {
+                        printf("!!!!!! tensor %s used after free (as view %s)\n", ancestor->name, parent->name);
+                    }
+                    allocate_node(buffer, ancestor);
+                }
                 allocate_node(buffer, parent);
             }
 
             // allocate node
             allocate_node(buffer, node);
 
-            // update parents
-            if (is_view) {
-                struct ggml_tensor * ancestor = node;
-                do {
-                    ancestor = view_parent(ancestor);
-                } while (ggml_is_view(ancestor));
-                ancestor->n_views -= 1;
-                if (ancestor->n_views == 0) {
-                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
+            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
                 }
-            } else {
-                for (int j = 0; j < GGML_MAX_SRC; j++) {
-                    struct ggml_tensor * parent = node->src[j];
-                    if (parent == NULL) {
-                        break;
-                    }
+                AT_PRINTF("%s", parent->name);
+                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                    AT_PRINTF(", ");
+                }
+            }
+            AT_PRINTF("\n");
+
+            // update parents
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                parent->n_children -= 1;
+                if (parent->n_children == 0 && parent->n_views == 0) {
                     if (ggml_is_view(parent)) {
                         struct ggml_tensor * ancestor = parent;
                         do {
                             ancestor = view_parent(ancestor);
                         } while (ggml_is_view(ancestor));
                         ancestor->n_views -= 1;
-                        if (ancestor->n_views == 0) {
+                        if (ancestor->n_views == 0 && ancestor->n_children == 0) {
                             ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
                         }
                     } else {
-                        parent->n_children -= 1;
-                        if (parent->n_children == 0) {
-                            // free parent
-                            ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
-                        }
+                        ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
                     }
                 }
             }
+
+            if (is_view) {
+                struct ggml_tensor * ancestor = node;
+                do {
+                    ancestor = view_parent(ancestor);
+                } while (ggml_is_view(ancestor));
+                ancestor->n_views -= 1;
+                if (ancestor->n_views == 0 && ancestor->n_children == 0) {
+                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
+                }
+            }
+
+            AT_PRINTF("\n");
 
         }
     }
 }
diff --git a/ggml.c b/ggml.c
index 8cf96383a..cff0b2f59 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4533,6 +4533,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.node_id      =*/ -1,
         /*.n_children   =*/ 0,
         /*.n_views      =*/ 0,
+        /*.freed        =*/ false,
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
diff --git a/ggml.h b/ggml.h
index c93b7d1d4..50e81828a 100644
--- a/ggml.h
+++ b/ggml.h
@@ -425,6 +425,7 @@ extern "C" {
         int node_id; // used to build graphs
         int n_children;
         int n_views;
+        bool freed; // debug
 
         // performance
         int perf_runs;
@@ -437,7 +438,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[12];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
diff --git a/llama.cpp b/llama.cpp
index e16ba7ac8..1eefb8641 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -703,7 +703,9 @@ static bool kv_cache_init(
     const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
-    size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB;
+    size_t size = 2u*n_elements*ggml_type_size(wtype);
+
+    fprintf(stderr, "%s: allocating %.2f MB for kv cache\n", __func__, size / 1024.0 / 1024.0);
 
     cache.buf = ggml_buffer_alloc(backend, size, 2);
     cache.n = 0;