improved memory management fixes

This commit is contained in:
slaren 2023-07-21 12:41:46 +02:00
parent 56e9ae062c
commit 3d679827e7
4 changed files with 72 additions and 28 deletions

View File

@ -7,6 +7,9 @@
// Marks a value as intentionally unused by casting it to void.
#define UNUSED(x) (void)(x)
// Allocator trace output. As defined below, AT_PRINTF expands to a no-op
// expression, so its arguments are discarded by the preprocessor and never
// evaluated. To enable tracing, swap in the printf mapping on the next line
// (and remove the no-op definition).
//#define AT_PRINTF printf
#define AT_PRINTF(...) ((void)0)
// allocator
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@ -146,16 +149,16 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
/////
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
allocator_ctx->size = MAX_SIZE_INIT;
//allocator_ctx->data = 0;
allocator_ctx->data = 0x1000;
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
//allocator_ctx->free_blocks[0].addr = 0;
allocator_ctx->free_blocks[0].addr = 0x1000;
}
/////
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, allocator_ctx->alignment);
// printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
size_t max_avail = 0;
@ -173,7 +176,7 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
}
}
// printf("block %d\n", best_fit_block);
AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
@ -217,7 +220,8 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, allocator_ctx->alignment);
//printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
tensor->freed = true;
// see if we can merge with an existing block
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
@ -826,11 +830,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
struct ggml_tensor * node = gf->nodes[i];
node->n_children = 0;
node->n_views = 0;
//node->freed = false;
}
for (int i = 0; i < gf->n_leafs; i++) {
struct ggml_tensor * leaf = gf->leafs[i];
leaf->n_children = 0;
leaf->n_views = 0;
//leaf->freed = false;
}
}
@ -839,6 +845,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
struct ggml_cgraph * gf = graphs[g];
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
if (ggml_is_view(node)) {
struct ggml_tensor * ancestor = node;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views += 1;
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
@ -869,47 +882,74 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
if (parent == NULL) {
break;
}
if (parent->freed) {
printf("!!!!!! tensor %s used after free\n", parent->name);
}
if (ggml_is_view(parent)) {
struct ggml_tensor * ancestor = parent;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
if (ancestor->freed) {
printf("!!!!!! tensor %s used after free (as view %s)\n", ancestor->name, parent->name);
}
allocate_node(buffer, ancestor);
}
allocate_node(buffer, parent);
}
// allocate node
allocate_node(buffer, node);
// update parents
if (is_view) {
struct ggml_tensor * ancestor = node;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views -= 1;
if (ancestor->n_views == 0) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
} else {
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
AT_PRINTF("%s", parent->name);
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
AT_PRINTF(", ");
}
}
AT_PRINTF("\n");
// update parents
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
parent->n_children -= 1;
if (parent->n_children == 0 && parent->n_views == 0) {
if (ggml_is_view(parent)) {
struct ggml_tensor * ancestor = parent;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views -= 1;
if (ancestor->n_views == 0) {
if (ancestor->n_views == 0 && ancestor->n_children == 0) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
}
}
else {
parent->n_children -= 1;
if (parent->n_children == 0) {
// free parent
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
}
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
}
}
}
if (is_view) {
struct ggml_tensor * ancestor = node;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views -= 1;
if (ancestor->n_views == 0 && ancestor->n_children == 0) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
}
}
AT_PRINTF("\n");
}
}
}

1
ggml.c
View File

@ -4533,6 +4533,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.node_id =*/ -1,
/*.n_children =*/ 0,
/*.n_views =*/ 0,
/*.freed =*/ false,
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,

3
ggml.h
View File

@ -425,6 +425,7 @@ extern "C" {
int node_id; // used to build graphs
int n_children;
int n_views;
bool freed; // debug
// performance
int perf_runs;
@ -437,7 +438,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[12];
char padding[8];
};
// Size in bytes of struct ggml_tensor as reported by sizeof, i.e. including
// the struct's explicit padding member.
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

View File

@ -703,7 +703,9 @@ static bool kv_cache_init(
const int64_t n_mem = n_layer*n_ctx;
const int64_t n_elements = n_embd*n_mem;
size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB;
size_t size = 2u*n_elements*ggml_type_size(wtype);
fprintf(stderr, "%s: allocating %.2f MB for kv cache\n", __func__, size / 1024.0 / 1024.0);
cache.buf = ggml_buffer_alloc(backend, size, 2);
cache.n = 0;