improved memory management

This commit is contained in:
slaren 2023-07-21 00:28:49 +02:00
parent de69f8f20d
commit cd6f5dec92
6 changed files with 350 additions and 198 deletions

View File

@ -175,8 +175,6 @@ int main(int argc, char ** argv)
llama_backend_free();
llama_backend_free();
return 0;
}

View File

@ -57,11 +57,9 @@ static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * allo
}
alloc->max_size = MAX(alloc->max_size, context->offset + size);
if (alloc->measure) {
tensor->data = NULL;
} else {
tensor->data = (char*)context->data + context->offset;
if (!alloc->measure) {
if (alloc->interface.init_tensor) {
ggml_backend_buffer_init_tensor(alloc, tensor);
}
@ -71,7 +69,7 @@ static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * allo
}
static void ggml_allocator_simple_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
GGML_ASSERT(!"ggml_simple_allocator cannot free individual tensors");
GGML_ASSERT(!"ggml_allocator_simple cannot free individual tensors");
UNUSED(alloc);
UNUSED(tensor);
@ -117,12 +115,206 @@ static struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size
return allocator;
}
//
//////////////////////////////////////////////////////////////
// backend buffer allocator - default - can free tensors
struct free_block {
void * addr;
size_t size;
};
#define MAX_FREE_BLOCKS 128
struct ggml_allocator_default_context {
void * data;
size_t size;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[1024];
};
void ggml_allocator_default_free_buffer(struct ggml_backend_buffer * alloc) {
struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
free(allocator_ctx);
}
static const size_t MAX_SIZE_INIT = (1ULL<<40)-1;
void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
/////
if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
allocator_ctx->size = MAX_SIZE_INIT;
//allocator_ctx->data = 0;
allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
//allocator_ctx->free_blocks[0].addr = 0;
}
/////
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, allocator_ctx->alignment);
// printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
size_t max_avail = 0;
//fprintf(stderr, "%s: allocating %s - %zu bytes\n", __func__, tensor->name, size);
// find the best fitting free block
int best_fit_block = -1;
size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
struct free_block * block = &allocator_ctx->free_blocks[i];
max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) {
best_fit_block = i;
best_fit_size = block->size;
}
}
// printf("block %d\n", best_fit_block);
if (best_fit_block == -1) {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
return;
}
struct free_block * block = &allocator_ctx->free_blocks[best_fit_block];
void * addr = block->addr;
block->addr = (char*)block->addr + size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
allocator_ctx->n_free_blocks--;
for (int j = best_fit_block; j < allocator_ctx->n_free_blocks; j++) {
allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
}
}
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)allocator_ctx->data + size);
tensor->data = addr;
if (!alloc->measure) {
if (alloc->interface.init_tensor) {
ggml_backend_buffer_init_tensor(alloc, tensor);
}
}
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
void * ptr = tensor->data;
if (ptr < allocator_ctx->data || (char*)ptr >= (char*)allocator_ctx->data + alloc->max_size) {
//fprintf(stderr, "%s: %s - tensor not in this buffer (%p - %p - %zu)\n", __func__, tensor->name, ptr, allocator_ctx->data, allocator_ctx->size);
//GGML_ASSERT(!"trying to free a tensor that was not allocated by this allocator");
return;
}
size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, allocator_ctx->alignment);
//printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
// see if we can merge with an existing block
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
struct free_block * block = &allocator_ctx->free_blocks[i];
// check if ptr is at the end of the block
if ((char*)block->addr + block->size == ptr) {
block->size += size;
// check if we can merge with the next block
if (i < allocator_ctx->n_free_blocks - 1 && (char*)block->addr + block->size == allocator_ctx->free_blocks[i+1].addr) {
block->size += allocator_ctx->free_blocks[i+1].size;
allocator_ctx->n_free_blocks--;
for (int j = i+1; j < allocator_ctx->n_free_blocks; j++) {
allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
}
}
return;
}
// check if ptr is at the beginning of the block
if ((char*)ptr + size == block->addr) {
block->addr = ptr;
block->size += size;
// check if we can merge with the previous block
if (i > 0 && (char*)allocator_ctx->free_blocks[i-1].addr + allocator_ctx->free_blocks[i-1].size == block->addr) {
allocator_ctx->free_blocks[i-1].size += block->size;
allocator_ctx->n_free_blocks--;
for (int j = i; j < allocator_ctx->n_free_blocks; j++) {
allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
}
}
return;
}
}
// otherwise, add a new block
if (allocator_ctx->n_free_blocks < MAX_FREE_BLOCKS) {
// insert the new block in the correct position to keep the array sorted
int insert_pos = 0;
while (insert_pos < allocator_ctx->n_free_blocks && allocator_ctx->free_blocks[insert_pos].addr < ptr) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = allocator_ctx->n_free_blocks; i > insert_pos; i--) {
allocator_ctx->free_blocks[i] = allocator_ctx->free_blocks[i-1];
}
// insert the new block
allocator_ctx->free_blocks[insert_pos].addr = ptr;
allocator_ctx->free_blocks[insert_pos].size = size;
allocator_ctx->n_free_blocks++;
}
else {
GGML_ASSERT(!"out of free blocks");
}
}
static void ggml_allocator_default_reset(struct ggml_backend_buffer * alloc) {
struct ggml_allocator_default_context * ctx = (struct ggml_allocator_default_context *)alloc->context;
ctx->n_free_blocks = 1; // TODO
size_t align_offset = aligned_offset(ctx->data, 0, ctx->alignment);
ctx->free_blocks[0].addr = (char *)ctx->data + align_offset;
ctx->free_blocks[0].size = ctx->size - align_offset;
}
static const struct ggml_backend_buffer_interface ggml_allocator_default_interface = {
/* .free_buffer = */ ggml_allocator_default_free_buffer,
/* .alloc_tensor = */ ggml_allocator_default_alloc_tensor,
/* .free_tensor = */ ggml_allocator_default_free_tensor,
/* .reset = */ ggml_allocator_default_reset,
/* .get_alloc_size = */ ggml_allocator_simple_get_alloc_size,
/* .init_tensor = */ NULL,
/* .free_data = */ NULL,
};
struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
return ggml_allocator_simple_init(data, size, alignment);
struct ggml_allocator_default_context * ctx = malloc(sizeof(struct ggml_allocator_default_context) /* + n_free_blocks * sizeof(struct free_block) */);
ctx->data = data;
ctx->size = size;
ctx->alignment = alignment;
ctx->n_free_blocks = 1; // TODO
size_t align_offset = aligned_offset(data, 0, alignment);
ctx->free_blocks[0].addr = (char *)data + align_offset;
ctx->free_blocks[0].size = size - align_offset;
struct ggml_backend_buffer * allocator = malloc(sizeof(struct ggml_backend_buffer));
*allocator = (struct ggml_backend_buffer){
/* .interface = */ ggml_allocator_default_interface,
/* .context = */ ctx,
/* .backend = */ NULL,
/* .backend_data = */ NULL,
/* .measure = */ false,
/* .max_size = */ 0,
};
return allocator;
}
//struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
// return ggml_allocator_simple_init(data, size, alignment);
//}
// buffer
struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors) {
@ -524,190 +716,6 @@ void ggml_graph_splits_compute(struct ggml_graph_splits * splits) {
//exit(0);
}
#if 0
// default allocator
struct free_block {
void * addr;
size_t size;
};
struct ggml_backend_default_allocator_context {
void * data;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[];
};
void ggml_backend_default_allocator_free_context(ggml_allocator_context_t ctx) {
struct ggml_backend_default_allocator_context * allocator_ctx = ctx;
free(allocator_ctx);
}
ggml_allocator_context_t ggml_backend_default_allocator_context(void * data, size_t size, size_t alignment, int n_free_blocks) {
struct ggml_backend_default_allocator_context * ctx = malloc(sizeof(struct ggml_backend_default_allocator_context) + n_free_blocks * sizeof(struct free_block));
ctx->data = data;
ctx->alignment = alignment;
ctx->n_free_blocks = 1;
size_t align_offset = align_offset(data, alignment);
ctx->free_blocks[0].addr = (char *)data + align_offset;
ctx->free_blocks[0].size = size - align_offset;
return ctx;
}
void * ggml_backend_default_allocator_alloc(ggml_allocator_context_t ctx, size_t size) {
struct ggml_backend_default_allocator_context * allocator_ctx = ctx;
size = align_size(size, allocator_ctx->alignment);
// find a free block
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
struct free_block * block = &allocator_ctx->free_blocks[i];
if (block->size >= size) {
void * addr = block->addr;
block->addr += size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
allocator_ctx->n_free_blocks--;
for (int j = i; j < allocator_ctx->n_free_blocks; j++) {
allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
}
}
return addr;
}
}
return NULL;
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
void ggml_backend_default_allocator_free(ggml_allocator_context_t ctx, void * ptr, size_t size) {
struct ggml_backend_default_allocator_context * allocator_ctx = ctx;
size = align_size(size, allocator_ctx->alignment);
// see if we can merge with an existing block
for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
struct free_block * block = &allocator_ctx->free_blocks[i];
// check if ptr is at the end of the block
if (block->addr + block->size == ptr) {
block->size += size;
// check if we can merge with the next block
if (i < allocator_ctx->n_free_blocks - 1 && block->addr + block->size == allocator_ctx->free_blocks[i+1].addr) {
block->size += allocator_ctx->free_blocks[i+1].size;
allocator_ctx->n_free_blocks--;
for (int j = i+1; j < allocator_ctx->n_free_blocks; j++) {
allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
}
}
return;
}
// check if ptr is at the beginning of the block
if (ptr + size == block->addr) {
block->addr = ptr;
block->size += size;
// check if we can merge with the previous block
if (i > 0 && allocator_ctx->free_blocks[i-1].addr + allocator_ctx->free_blocks[i-1].size == block->addr) {
allocator_ctx->free_blocks[i-1].size += block->size;
allocator_ctx->n_free_blocks--;
for (int j = i; j < allocator_ctx->n_free_blocks; j++) {
allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
}
}
return;
}
}
// otherwise, add a new block
if (allocator_ctx->n_free_blocks < MAX_FREE_BLOCKS) {
// insert the new block in the correct position to keep the array sorted
int insert_pos = 0;
while (insert_pos < allocator_ctx->n_free_blocks && allocator_ctx->free_blocks[insert_pos].addr < ptr) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = allocator_ctx->n_free_blocks; i > insert_pos; i--) {
allocator_ctx->free_blocks[i] = allocator_ctx->free_blocks[i-1];
}
// insert the new block
allocator_ctx->free_blocks[insert_pos].addr = ptr;
allocator_ctx->free_blocks[insert_pos].size = size;
allocator_ctx->n_free_blocks++;
}
else {
GGML_ASSERT(!"out of free blocks");
}
}
static bool ggml_is_view(struct ggml_tensor * t) {
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE;
}
NOTE: id can be n_leaf OR n_node instead, we can determine the type by checking if the node is a leaf or not
void allocate_graph(struct ggml_cgraph * gf, struct ggml_buffer * buffer) {
int node_children_count[GGML_MAX_NODES*2];
int node_view_count[GGML_MAX_NODES*2];
memset(node_children_count, 0, sizeof(int) * (gf->n_nodes + gf->n_leafs));
memset(node_view_count, 0, sizeof(int) * (gf->n_nodes + gf->n_leafs));
// count number of children and views
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
// todo: ....
node_children_count[parent->id] += 1;
if (ggml_is_view(parent)) {
struct ggml_tensor * ancestor = parent;
do {
node_view_count[ancestor->id] += 1;
ancestor = ancestor->src[0];
} while (ggml_is_view(ancestor));
}
}
}
// allocate tensors
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
bool is_view = ggml_is_view(node);
if (is_view) {
// allocate view accordingly to the OP
node->data = node->src[0]->data; // + offset
struct ggml_tensor * ancestor = node->src[0];
while (ggml_is_view(ancestor)) {
ancestor = ancestor->src[0];
}
node_view_count[ancestor->id] -= 1;
} else {
if (node->data == NULL) {
// allocate tensor
// TODO: if last children and size == parent.size, then reuse parent tensor (auto in-place)
// may need a list of ops that can be in-place
ggml_backend_alloc_tensor(buffer, node);
}
}
// update parents
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
if (is_view) {
node_view_count[parent->id] -= 1;
}
node_children_count[parent->id] -= 1;
if (node_children_count[parent->id] == 0 && node_view_count[parent->id] == 0) {
// free parent
ggml_backend_free_tensor(buffer, parent);
}
}
}
}
#endif
void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx) {
ggml_graph_allocate_tensors_n(&graph, 1, ctx);
}
@ -717,6 +725,21 @@ static bool ggml_is_view(struct ggml_tensor * t) {
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
}
struct ggml_tensor * view_parent(struct ggml_tensor * t) {
switch (t->op) {
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
return t->src[0];
case GGML_OP_CPY:
return t->src[1];
default:
return NULL;
}
}
#if 0
void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, struct ggml_context * ctx) {
struct ggml_buffer * buffer = ggml_get_buffer(ctx);
for (int i = 0; i < n_graphs; i++) {
@ -763,6 +786,134 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
}
//printf("\n\n\n");
}
#else
void allocate_node(struct ggml_buffer * buffer, struct ggml_tensor * node) {
if (node->data == NULL) {
if (ggml_is_view(node)) {
size_t offset;
switch(node->op) {
case GGML_OP_VIEW:
memcpy(&offset, node->op_params, sizeof(size_t));
node->data = (char *) node->src[0]->data + offset;
break;
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
node->data = node->src[0]->data;
break;
case GGML_OP_CPY:
node->data = node->src[1]->data;
break;
default:
GGML_ASSERT(!"unknown view op");
break;
}
} else {
//printf("allocating tensor %s\n", node->name);
ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, node);
}
}
}
void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, struct ggml_context * ctx) {
struct ggml_buffer * buffer = ggml_get_buffer(ctx);
// reset counters
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
node->n_children = 0;
node->n_views = 0;
}
for (int i = 0; i < gf->n_leafs; i++) {
struct ggml_tensor * leaf = gf->leafs[i];
leaf->n_children = 0;
leaf->n_views = 0;
}
}
// count number of children and views
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
parent->n_children += 1;
if (ggml_is_view(parent)) {
struct ggml_tensor * ancestor = parent;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views += 1;
}
}
}
}
// allocate tensors
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
bool is_view = ggml_is_view(node);
// allocate parents (leafs)
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
allocate_node(buffer, parent);
}
// allocate node
allocate_node(buffer, node);
// update parents
if (is_view) {
struct ggml_tensor * ancestor = node;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views -= 1;
if (ancestor->n_views == 0) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
}
} else {
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
if (ggml_is_view(parent)) {
struct ggml_tensor * ancestor = parent;
do {
ancestor = view_parent(ancestor);
} while (ggml_is_view(ancestor));
ancestor->n_views -= 1;
if (ancestor->n_views == 0) {
ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
}
}
else {
parent->n_children -= 1;
if (parent->n_children == 0) {
// free parent
ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
}
}
}
}
}
}
}
#endif
void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) {
bool visited[GGML_MAX_SPLITS] = {false};

View File

@ -35,7 +35,7 @@ extern "C" {
// backend buffer helper functions
GGML_API void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc);
static inline void ggml_backend_buffer_tensor_alloc(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.alloc_tensor(alloc, tensor); }
static inline void ggml_backend_buffer_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
static inline void ggml_backend_buffer_tensor_free(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
// default buffer allocator

2
ggml.c
View File

@ -4531,6 +4531,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.grad =*/ NULL,
/*.src =*/ { NULL },
/*.node_id =*/ -1,
/*.n_children =*/ 0,
/*.n_views =*/ 0,
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,

6
ggml.h
View File

@ -423,21 +423,21 @@ extern "C" {
struct ggml_tensor * src[GGML_MAX_SRC];
int node_id; // used to build graphs
int n_children;
int n_views;
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
void * data;
char name[GGML_MAX_NAME];
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[4];
char padding[12];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

View File

@ -164,6 +164,7 @@ struct llama_kv_cache {
~llama_kv_cache() {
if (ctx) {
ggml_buffer_free(buf);
ggml_free(ctx);
}
}
@ -1210,6 +1211,7 @@ static ggml_graph_splits llama_build_graph(
// TODO: this shouldn't be necessary
bool measuring = lctx.bufs_compute[0]->backend_buffer->measure;
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx_kv, GGML_TYPE_F32, 1);
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
if (!measuring) {
// this should be automatic
if (KQ_scale->data == NULL) {
@ -1217,7 +1219,6 @@ static ggml_graph_splits llama_build_graph(
}
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
if (embeddings_input) {
// use embeddings as input