improved memory management

2024-12-27 03:44:35 +00:00 · 2023-07-21 00:28:49 +02:00 · 2023-07-21 00:28:49 +02:00 · cd6f5dec92
commit cd6f5dec92
parent de69f8f20d
6 changed files with 350 additions and 198 deletions
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -175,8 +175,6 @@ int main(int argc, char ** argv)
    llama_backend_free();
    llama_backend_free();
    return 0;
 }
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -57,11 +57,9 @@ static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * allo
    }
    alloc->max_size = MAX(alloc->max_size, context->offset + size);
    if (alloc->measure) {
        tensor->data = NULL;
    } else {
    tensor->data = (char*)context->data + context->offset;
    if (!alloc->measure) {
        if (alloc->interface.init_tensor) {
            ggml_backend_buffer_init_tensor(alloc, tensor);
        }
@ -71,7 +69,7 @@ static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * allo
 }
 static void ggml_allocator_simple_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
-    GGML_ASSERT(!"ggml_simple_allocator cannot free individual tensors");
+    GGML_ASSERT(!"ggml_allocator_simple cannot free individual tensors");
    UNUSED(alloc);
    UNUSED(tensor);
@ -117,12 +115,206 @@ static struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size
    return allocator;
 }
-//
+//////////////////////////////////////////////////////////////
 // backend buffer allocator - default - can free tensors
 struct free_block {
    void * addr;
    size_t size;
 };
 #define MAX_FREE_BLOCKS 128
 struct ggml_allocator_default_context {
    void * data;
    size_t size;
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[1024];
 };
 void ggml_allocator_default_free_buffer(struct ggml_backend_buffer * alloc) {
    struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
    free(allocator_ctx);
 }
 static const size_t MAX_SIZE_INIT = (1ULL<<40)-1;
 void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
    struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
    /////
    if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
        allocator_ctx->size = MAX_SIZE_INIT;
        //allocator_ctx->data = 0;
        allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
        //allocator_ctx->free_blocks[0].addr = 0;
    }
    /////
    size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, allocator_ctx->alignment);
    // printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
    size_t max_avail = 0;
    //fprintf(stderr, "%s: allocating %s - %zu bytes\n", __func__, tensor->name, size);
    // find the best fitting free block
    int best_fit_block = -1;
    size_t best_fit_size = SIZE_MAX;
    for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
        struct free_block * block = &allocator_ctx->free_blocks[i];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size && block->size <= best_fit_size) {
            best_fit_block = i;
            best_fit_size = block->size;
        }
    }
    // printf("block %d\n", best_fit_block);
    if (best_fit_block == -1) {
        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                __func__, size, max_avail);
        GGML_ASSERT(!"not enough space in the buffer");
        return;
    }
    struct free_block * block = &allocator_ctx->free_blocks[best_fit_block];
    void * addr = block->addr;
    block->addr = (char*)block->addr + size;
    block->size -= size;
    if (block->size == 0) {
        // remove block if empty
        allocator_ctx->n_free_blocks--;
        for (int j = best_fit_block; j < allocator_ctx->n_free_blocks; j++) {
            allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
        }
    }
    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)allocator_ctx->data + size);
    tensor->data = addr;
    if (!alloc->measure) {
        if (alloc->interface.init_tensor) {
            ggml_backend_buffer_init_tensor(alloc, tensor);
        }
    }
 }
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
    struct ggml_allocator_default_context * allocator_ctx = (struct ggml_allocator_default_context *)alloc->context;
    void * ptr = tensor->data;
    if (ptr < allocator_ctx->data || (char*)ptr >= (char*)allocator_ctx->data + alloc->max_size) {
        //fprintf(stderr, "%s: %s - tensor not in this buffer (%p - %p - %zu)\n", __func__, tensor->name, ptr, allocator_ctx->data, allocator_ctx->size);
        //GGML_ASSERT(!"trying to free a tensor that was not allocated by this allocator");
        return;
    }
    size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, allocator_ctx->alignment);
    //printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
    // see if we can merge with an existing block
    for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
        struct free_block * block = &allocator_ctx->free_blocks[i];
        // check if ptr is at the end of the block
        if ((char*)block->addr + block->size == ptr) {
            block->size += size;
            // check if we can merge with the next block
            if (i < allocator_ctx->n_free_blocks - 1 && (char*)block->addr + block->size == allocator_ctx->free_blocks[i+1].addr) {
                block->size += allocator_ctx->free_blocks[i+1].size;
                allocator_ctx->n_free_blocks--;
                for (int j = i+1; j < allocator_ctx->n_free_blocks; j++) {
                    allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
        if ((char*)ptr + size == block->addr) {
            block->addr = ptr;
            block->size += size;
            // check if we can merge with the previous block
            if (i > 0 && (char*)allocator_ctx->free_blocks[i-1].addr + allocator_ctx->free_blocks[i-1].size == block->addr) {
                allocator_ctx->free_blocks[i-1].size += block->size;
                allocator_ctx->n_free_blocks--;
                for (int j = i; j < allocator_ctx->n_free_blocks; j++) {
                    allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
                }
            }
            return;
        }
    }
    // otherwise, add a new block
    if (allocator_ctx->n_free_blocks < MAX_FREE_BLOCKS) {
        // insert the new block in the correct position to keep the array sorted
        int insert_pos = 0;
        while (insert_pos < allocator_ctx->n_free_blocks && allocator_ctx->free_blocks[insert_pos].addr < ptr) {
            insert_pos++;
        }
        // shift all blocks from insert_pos onward to make room for the new block
        for (int i = allocator_ctx->n_free_blocks; i > insert_pos; i--) {
            allocator_ctx->free_blocks[i] = allocator_ctx->free_blocks[i-1];
        }
        // insert the new block
        allocator_ctx->free_blocks[insert_pos].addr = ptr;
        allocator_ctx->free_blocks[insert_pos].size = size;
        allocator_ctx->n_free_blocks++;
    }
    else {
        GGML_ASSERT(!"out of free blocks");
    }
 }
 static void ggml_allocator_default_reset(struct ggml_backend_buffer * alloc) {
    struct ggml_allocator_default_context * ctx = (struct ggml_allocator_default_context *)alloc->context;
    ctx->n_free_blocks = 1; // TODO
    size_t align_offset = aligned_offset(ctx->data, 0, ctx->alignment);
    ctx->free_blocks[0].addr = (char *)ctx->data + align_offset;
    ctx->free_blocks[0].size = ctx->size - align_offset;
 }
 static const struct ggml_backend_buffer_interface ggml_allocator_default_interface = {
    /* .free_buffer    = */ ggml_allocator_default_free_buffer,
    /* .alloc_tensor   = */ ggml_allocator_default_alloc_tensor,
    /* .free_tensor    = */ ggml_allocator_default_free_tensor,
    /* .reset          = */ ggml_allocator_default_reset,
    /* .get_alloc_size = */ ggml_allocator_simple_get_alloc_size,
    /* .init_tensor    = */ NULL,
    /* .free_data      = */ NULL,
 };
 struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
-    return ggml_allocator_simple_init(data, size, alignment);
+    struct ggml_allocator_default_context * ctx = malloc(sizeof(struct ggml_allocator_default_context) /* + n_free_blocks * sizeof(struct free_block) */);
    ctx->data = data;
    ctx->size = size;
    ctx->alignment = alignment;
    ctx->n_free_blocks = 1; // TODO
    size_t align_offset = aligned_offset(data, 0, alignment);
    ctx->free_blocks[0].addr = (char *)data + align_offset;
    ctx->free_blocks[0].size = size - align_offset;
    struct ggml_backend_buffer * allocator = malloc(sizeof(struct ggml_backend_buffer));
    *allocator = (struct ggml_backend_buffer){
        /* .interface    = */ ggml_allocator_default_interface,
        /* .context      = */ ctx,
        /* .backend      = */ NULL,
        /* .backend_data = */ NULL,
        /* .measure      = */ false,
        /* .max_size     = */ 0,
    };
    return allocator;
 }
 //struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
 //    return ggml_allocator_simple_init(data, size, alignment);
 //}
 // buffer
 struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors) {
@ -524,190 +716,6 @@ void ggml_graph_splits_compute(struct ggml_graph_splits * splits) {
    //exit(0);
 }
 #if 0
 // default allocator
 struct free_block {
    void * addr;
    size_t size;
 };
 struct ggml_backend_default_allocator_context {
    void * data;
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[];
 };
 void ggml_backend_default_allocator_free_context(ggml_allocator_context_t ctx) {
    struct ggml_backend_default_allocator_context * allocator_ctx = ctx;
    free(allocator_ctx);
 }
 ggml_allocator_context_t ggml_backend_default_allocator_context(void * data, size_t size, size_t alignment, int n_free_blocks) {
    struct ggml_backend_default_allocator_context * ctx = malloc(sizeof(struct ggml_backend_default_allocator_context) + n_free_blocks * sizeof(struct free_block));
    ctx->data = data;
    ctx->alignment = alignment;
    ctx->n_free_blocks = 1;
    size_t align_offset = align_offset(data, alignment);
    ctx->free_blocks[0].addr = (char *)data + align_offset;
    ctx->free_blocks[0].size = size - align_offset;
    return ctx;
 }
 void * ggml_backend_default_allocator_alloc(ggml_allocator_context_t ctx, size_t size) {
    struct ggml_backend_default_allocator_context * allocator_ctx = ctx;
    size = align_size(size, allocator_ctx->alignment);
    // find a free block
    for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
        struct free_block * block = &allocator_ctx->free_blocks[i];
        if (block->size >= size) {
            void * addr = block->addr;
            block->addr += size;
            block->size -= size;
            if (block->size == 0) {
                // remove block if empty
                allocator_ctx->n_free_blocks--;
                for (int j = i; j < allocator_ctx->n_free_blocks; j++) {
                    allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
                }
            }
            return addr;
        }
    }
    return NULL;
 }
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 void ggml_backend_default_allocator_free(ggml_allocator_context_t ctx, void * ptr, size_t size) {
    struct ggml_backend_default_allocator_context * allocator_ctx = ctx;
    size = align_size(size, allocator_ctx->alignment);
    // see if we can merge with an existing block
    for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
        struct free_block * block = &allocator_ctx->free_blocks[i];
        // check if ptr is at the end of the block
        if (block->addr + block->size == ptr) {
            block->size += size;
            // check if we can merge with the next block
            if (i < allocator_ctx->n_free_blocks - 1 && block->addr + block->size == allocator_ctx->free_blocks[i+1].addr) {
                block->size += allocator_ctx->free_blocks[i+1].size;
                allocator_ctx->n_free_blocks--;
                for (int j = i+1; j < allocator_ctx->n_free_blocks; j++) {
                    allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
        if (ptr + size == block->addr) {
            block->addr = ptr;
            block->size += size;
            // check if we can merge with the previous block
            if (i > 0 && allocator_ctx->free_blocks[i-1].addr + allocator_ctx->free_blocks[i-1].size == block->addr) {
                allocator_ctx->free_blocks[i-1].size += block->size;
                allocator_ctx->n_free_blocks--;
                for (int j = i; j < allocator_ctx->n_free_blocks; j++) {
                    allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1];
                }
            }
            return;
        }
    }
    // otherwise, add a new block
    if (allocator_ctx->n_free_blocks < MAX_FREE_BLOCKS) {
        // insert the new block in the correct position to keep the array sorted
        int insert_pos = 0;
        while (insert_pos < allocator_ctx->n_free_blocks && allocator_ctx->free_blocks[insert_pos].addr < ptr) {
            insert_pos++;
        }
        // shift all blocks from insert_pos onward to make room for the new block
        for (int i = allocator_ctx->n_free_blocks; i > insert_pos; i--) {
            allocator_ctx->free_blocks[i] = allocator_ctx->free_blocks[i-1];
        }
        // insert the new block
        allocator_ctx->free_blocks[insert_pos].addr = ptr;
        allocator_ctx->free_blocks[insert_pos].size = size;
        allocator_ctx->n_free_blocks++;
    }
    else {
        GGML_ASSERT(!"out of free blocks");
    }
 }
 static bool ggml_is_view(struct ggml_tensor * t) {
    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE;
 }
 NOTE: id can be n_leaf OR n_node instead, we can determine the type by checking if the node is a leaf or not
 void allocate_graph(struct ggml_cgraph * gf, struct ggml_buffer * buffer) {
    int node_children_count[GGML_MAX_NODES*2];
    int node_view_count[GGML_MAX_NODES*2];
    memset(node_children_count, 0, sizeof(int) * (gf->n_nodes + gf->n_leafs));
    memset(node_view_count, 0, sizeof(int) * (gf->n_nodes + gf->n_leafs));
    // count number of children and views
    for (int i = 0; i < gf->n_nodes; i++) {
        struct ggml_tensor * node = gf->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                break;
            }
            // todo: ....
            node_children_count[parent->id] += 1;
            if (ggml_is_view(parent)) {
                struct ggml_tensor * ancestor = parent;
                do {
                    node_view_count[ancestor->id] += 1;
                    ancestor = ancestor->src[0];
                } while (ggml_is_view(ancestor));
            }
        }
    }
    // allocate tensors
    for (int i = 0; i < gf->n_nodes; i++) {
        struct ggml_tensor * node = gf->nodes[i];
        bool is_view = ggml_is_view(node);
        if (is_view) {
            // allocate view accordingly to the OP
            node->data = node->src[0]->data; // + offset
            struct ggml_tensor * ancestor = node->src[0];
            while (ggml_is_view(ancestor)) {
                ancestor = ancestor->src[0];
            }
            node_view_count[ancestor->id] -= 1;
        } else {
            if (node->data == NULL) {
                // allocate tensor
                // TODO: if last children and size == parent.size, then reuse parent tensor (auto in-place)
                // may need a list of ops that can be in-place
                ggml_backend_alloc_tensor(buffer, node);
            }
        }
        // update parents
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                break;
            }
            if (is_view) {
                node_view_count[parent->id] -= 1;
            }
            node_children_count[parent->id] -= 1;
            if (node_children_count[parent->id] == 0 && node_view_count[parent->id] == 0) {
                // free parent
                ggml_backend_free_tensor(buffer, parent);
            }
        }
    }
 }
 #endif
 void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx) {
    ggml_graph_allocate_tensors_n(&graph, 1, ctx);
 }
@ -717,6 +725,21 @@ static bool ggml_is_view(struct ggml_tensor * t) {
           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
 }
 struct ggml_tensor * view_parent(struct ggml_tensor * t) {
    switch (t->op) {
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_PERMUTE:
            return t->src[0];
        case GGML_OP_CPY:
            return t->src[1];
        default:
            return NULL;
    }
 }
 #if 0
 void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, struct ggml_context * ctx) {
    struct ggml_buffer * buffer = ggml_get_buffer(ctx);
    for (int i = 0; i < n_graphs; i++) {
@ -763,6 +786,134 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
    }
    //printf("\n\n\n");
 }
 #else
 void allocate_node(struct ggml_buffer * buffer, struct ggml_tensor * node) {
    if (node->data == NULL) {
        if (ggml_is_view(node)) {
            size_t offset;
            switch(node->op) {
                case GGML_OP_VIEW:
                    memcpy(&offset, node->op_params, sizeof(size_t));
                    node->data = (char *) node->src[0]->data + offset;
                    break;
                case GGML_OP_RESHAPE:
                case GGML_OP_TRANSPOSE:
                case GGML_OP_PERMUTE:
                    node->data = node->src[0]->data;
                    break;
                case GGML_OP_CPY:
                    node->data = node->src[1]->data;
                    break;
                default:
                    GGML_ASSERT(!"unknown view op");
                    break;
            }
        } else {
            //printf("allocating tensor %s\n", node->name);
            ggml_backend_buffer_tensor_alloc(buffer->backend_buffer, node);
        }
    }
 }
 void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, struct ggml_context * ctx) {
    struct ggml_buffer * buffer = ggml_get_buffer(ctx);
    // reset counters
    for (int g = 0; g < n_graphs; g++) {
        struct ggml_cgraph * gf = graphs[g];
        for (int i = 0; i < gf->n_nodes; i++) {
            struct ggml_tensor * node = gf->nodes[i];
            node->n_children = 0;
            node->n_views = 0;
        }
        for (int i = 0; i < gf->n_leafs; i++) {
            struct ggml_tensor * leaf = gf->leafs[i];
            leaf->n_children = 0;
            leaf->n_views = 0;
        }
    }
    // count number of children and views
    for (int g = 0; g < n_graphs; g++) {
        struct ggml_cgraph * gf = graphs[g];
        for (int i = 0; i < gf->n_nodes; i++) {
            struct ggml_tensor * node = gf->nodes[i];
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
                parent->n_children += 1;
                if (ggml_is_view(parent)) {
                    struct ggml_tensor * ancestor = parent;
                    do {
                        ancestor = view_parent(ancestor);
                    } while (ggml_is_view(ancestor));
                    ancestor->n_views += 1;
                }
            }
        }
    }
    // allocate tensors
    for (int g = 0; g < n_graphs; g++) {
        struct ggml_cgraph * gf = graphs[g];
        for (int i = 0; i < gf->n_nodes; i++) {
            struct ggml_tensor * node = gf->nodes[i];
            bool is_view = ggml_is_view(node);
            // allocate parents (leafs)
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
                allocate_node(buffer, parent);
            }
            // allocate node
            allocate_node(buffer, node);
            // update parents
            if (is_view) {
                struct ggml_tensor * ancestor = node;
                do {
                    ancestor = view_parent(ancestor);
                } while (ggml_is_view(ancestor));
                ancestor->n_views -= 1;
                if (ancestor->n_views == 0) {
                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
                }
            } else {
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * parent = node->src[j];
                    if (parent == NULL) {
                        break;
                    }
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * ancestor = parent;
                        do {
                            ancestor = view_parent(ancestor);
                        } while (ggml_is_view(ancestor));
                        ancestor->n_views -= 1;
                        if (ancestor->n_views == 0) {
                            ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
                        }
                    }
                    else {
                        parent->n_children -= 1;
                        if (parent->n_children == 0) {
                            // free parent
                            ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
                        }
                    }
                }
            }
        }
    }
 }
 #endif
 void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) {
    bool visited[GGML_MAX_SPLITS] = {false};
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -35,7 +35,7 @@ extern "C" {
    // backend buffer helper functions
    GGML_API      void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc);
    static inline void ggml_backend_buffer_tensor_alloc(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.alloc_tensor(alloc, tensor); }
-    static inline void ggml_backend_buffer_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
+    static inline void ggml_backend_buffer_tensor_free(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
    static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
    // default buffer allocator
--- a/ggml.c
+++ b/ggml.c
@ -4531,6 +4531,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
        /*.grad         =*/ NULL,
        /*.src          =*/ { NULL },
        /*.node_id      =*/ -1,
        /*.n_children   =*/ 0,
        /*.n_views      =*/ 0,
        /*.perf_runs    =*/ 0,
        /*.perf_cycles  =*/ 0,
        /*.perf_time_us =*/ 0,
--- a/ggml.h
+++ b/ggml.h
@ -423,21 +423,21 @@ extern "C" {
        struct ggml_tensor * src[GGML_MAX_SRC];
        int node_id; // used to build graphs
        int n_children;
        int n_views;
        // performance
        int     perf_runs;
        int64_t perf_cycles;
        int64_t perf_time_us;
        void * data;
        char name[GGML_MAX_NAME];
        void * extra; // extra things e.g. for ggml-cuda.cu
-
+        char padding[12];
        char padding[4];
    };
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
--- a/llama.cpp
+++ b/llama.cpp
@ -164,6 +164,7 @@ struct llama_kv_cache {
    ~llama_kv_cache() {
        if (ctx) {
            ggml_buffer_free(buf);
            ggml_free(ctx);
        }
    }
@ -1210,6 +1211,7 @@ static ggml_graph_splits llama_build_graph(
    // TODO: this shouldn't be necessary
    bool measuring = lctx.bufs_compute[0]->backend_buffer->measure;
    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx_kv, GGML_TYPE_F32, 1);
    ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
    if (!measuring) {
        // this should be automatic
        if (KQ_scale->data == NULL) {
@ -1217,7 +1219,6 @@ static ggml_graph_splits llama_build_graph(
        }
        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
    }
    ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
    if (embeddings_input) {
        // use embeddings as input