diff --git a/ggml-backend.c b/ggml-backend.c
index 01d76af5c..9b596ac41 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -15,16 +15,20 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
     return offset + align;
 }
 
-static inline size_t ggml_backend_buffer_get_alloc_size(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { return alloc->interface.get_alloc_size(alloc, tensor); }
-static inline void ggml_backend_buffer_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.init_tensor(alloc, tensor); }
+static inline size_t ggml_backend_buffer_get_alloc_size(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
+    return alloc->interface.get_alloc_size(alloc, tensor);
+}
+static inline void ggml_backend_buffer_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
+    alloc->interface.init_tensor(alloc, tensor);
+}
 
 void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc) {
     alloc->interface.free_buffer(alloc);
     free(alloc);
 }
 
-// backend buffer allocator - simple
+// backend buffer allocator - simple - cannot free tensors, good for weights and small contexts
 
 struct ggml_allocator_simple_context {
     void * data;
@@ -38,21 +42,32 @@ static void ggml_allocator_simple_free_buffer(struct ggml_backend_buffer * alloc
     free(context);
 }
 
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
     struct ggml_allocator_simple_context * context = (struct ggml_allocator_simple_context *)alloc->context;
+
     size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
-    if (context->offset + size > context->size) {
+
+    if (!alloc->measure && context->offset + size > context->size) {
         fprintf(stderr, "%s: not enough space in the buffer (needed %zu, available %zu)\n",
                 __func__, size, context->size - context->offset);
         GGML_ASSERT(!"not enough space in the buffer");
         return;
     }
-    void * ptr = (char*)context->data + context->offset;
-    context->offset = aligned_offset(context->data, context->offset + size, context->alignment);
-    tensor->data = ptr;
-    if (alloc->interface.init_tensor) {
-        alloc->interface.init_tensor(alloc, tensor);
+
+    alloc->max_size = MAX(alloc->max_size, context->offset + size);
+
+    if (alloc->measure) {
+        tensor->data = NULL;
+    } else {
+        tensor->data = (char*)context->data + context->offset;
+        if (alloc->interface.init_tensor) {
+            ggml_backend_buffer_init_tensor(alloc, tensor);
+        }
     }
+
+    context->offset = aligned_offset(context->data, context->offset + size, context->alignment);
 }
 
 static void ggml_allocator_simple_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
@@ -83,7 +98,7 @@ static const struct ggml_backend_buffer_interface ggml_allocator_simple_interfac
     /* .free_data = */ NULL,
 };
 
-struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment) {
+static struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment) {
     struct ggml_allocator_simple_context * ctx = malloc(sizeof(struct ggml_allocator_simple_context));
     ctx->data = data;
     ctx->size = size;
@@ -94,20 +109,35 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size
     *allocator = (struct ggml_backend_buffer){
         /* .interface    = */ ggml_allocator_simple_interface,
         /* .context      = */ ctx,
+        /* .backend      = */ NULL,
         /* .backend_data = */ NULL,
+        /* .measure      = */ false,
+        /* .max_size     = */ 0,
     };
     return allocator;
 }
 
+//
+
+struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
+    return ggml_allocator_simple_init(data, size, alignment);
+}
+
 // buffer
 
 struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors) {
     struct ggml_buffer * buffer = malloc(sizeof(struct ggml_buffer));
 
     buffer->mem_size = ggml_tensor_overhead() * max_tensors;
     buffer->mem_buffer = malloc(buffer->mem_size);
-    buffer->backend = backend;
     size += 128 * max_tensors; // alignment overhead
     buffer->backend_buffer = backend->interface.alloc_buffer(backend, size);
+    buffer->backend_buffer->backend = backend;
+    return buffer;
+}
+
+struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors) {
+    struct ggml_buffer * buffer = ggml_buffer_alloc(backend, 0, max_tensors);
+    buffer->backend_buffer->measure = true;
     return buffer;
 }
 
@@ -190,7 +220,7 @@ static void ggml_backend_cpu_free_buffer(struct ggml_backend_buffer * alloc) {
 static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_backend * backend, size_t size) {
     void * data = malloc(size);
 
-    struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
+    struct ggml_backend_buffer * buffer = ggml_allocator_default_init(data, size, TENSOR_ALIGNMENT);
     buffer->interface.free_data = ggml_backend_cpu_free_buffer;
     buffer->backend_data = data;
 
@@ -674,3 +704,33 @@ void allocate_graph(struct ggml_cgraph * gf, struct ggml_buffer * buffer) {
 }
 
 #endif
+
+void ggml_graph_allocate_tensors(struct ggml_cgraph * graph) {
+    ggml_graph_allocate_tensors_n(&graph, 1);
+}
+
+void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs) {
+}
+
+
+void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) {
+    bool visited[GGML_MAX_SPLITS] = {false};
+    for (int i = 0; i < splits->n_splits; i++) {
+        if (!visited[i]) {
+            struct ggml_graph_split * split = &splits->splits[i];
+            struct ggml_backend * backend = split->dst_inputs[0]->backend; // not great
+            struct ggml_cgraph * backend_graphs[GGML_MAX_SPLITS];
+            int num_graphs = 0;
+            for (int j = i; j < splits->n_splits; j++) {
+                if (splits->splits[j].dst_inputs[0]->backend == backend) {
+                    backend_graphs[num_graphs++] = splits->splits[j].graph;
+                    visited[j] = true;
+                    // TODO: need to ensure that the output tensors are never freed
+                    // maybe this can be done automatically in ggml_graph_calc_compute_buffer_size by assuming that n_childs == 0 => output tensor
+                }
+            }
+            ggml_graph_allocate_tensors_n(backend_graphs, num_graphs);
+        }
+    }
+}
+
diff --git a/ggml-backend.h b/ggml-backend.h
index d3b77a4aa..9cbdaca6c 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -7,8 +7,7 @@ extern "C" {
 #endif
 
     struct ggml_backend;
-
-    // backend buffers
+    // backend buffer
     typedef void * ggml_buffer_context_t;
 
     struct ggml_backend_buffer;
@@ -27,7 +26,10 @@ extern "C" {
     struct ggml_backend_buffer {
         struct ggml_backend_buffer_interface interface;
         ggml_buffer_context_t context;
+        struct ggml_backend * backend;
         void * backend_data;
+        bool measure;
+        size_t max_size;
     };
 
     // backend buffer helper functions
@@ -36,11 +38,8 @@ extern "C" {
     static inline void ggml_backend_buffer_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
     static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
 
-    // default buffer allocators
-    // simple buffer allocator: cannot free tensors, good for weights and small contexts
-    // default buffer allocator: can free tensors, good for compute contexts
-    GGML_API struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment);
-    GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment, int max_free_blocks);
+    // default buffer allocator
+    GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment);
 
     // buffer
 
@@ -51,11 +50,12 @@ extern "C" {
         void * mem_buffer;
 
         // tensor data
-        struct ggml_backend * backend;
         struct ggml_backend_buffer * backend_buffer;
     };
 
-    GGML_API struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors);
+    GGML_API struct ggml_buffer * ggml_buffer_alloc        (struct ggml_backend * backend, size_t size, size_t max_tensors);
+    GGML_API struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors);
+    // measure buffers only calculate the maximum size of the buffer without allocating it - useful for pre-allocation
     GGML_API void ggml_buffer_free(struct ggml_buffer * buffer);
 
     // backend
@@ -152,6 +152,11 @@ extern "C" {
     // compute
     GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits);
 
+    // graph tensor allocator
+    GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph);
+    GGML_API void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs);
+    GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 2ca183e12..ccb1326b3 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1726,7 +1726,7 @@ static ggml_backend_buffer * ggml_backend_cuda_alloc_buffer(ggml_backend * backe
     void * data;
     CUDA_CHECK(cudaMalloc(&data, size));
 
-    ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
+    ggml_backend_buffer * buffer = ggml_allocator_default_init(data, size, TENSOR_ALIGNMENT);
     buffer->interface.free_data = ggml_backend_cuda_free_buffer;
     buffer->backend_data = data;
 
diff --git a/ggml.c b/ggml.c
index 19db8241f..ddf784f4c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4468,7 +4468,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
 }
 
 struct ggml_backend * ggml_get_ctx_backend(struct ggml_context * ctx) {
-    return ctx->buffer->backend;
+    return ctx->buffer->backend_buffer->backend;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -4520,7 +4520,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     ggml_assert_aligned(result);
 
     *result = (struct ggml_tensor) {
-        /*.backend      =*/ ctx->buffer->backend,
+        /*.backend      =*/ ggml_get_ctx_backend(ctx),
         /*.type         =*/ type,
         /*.n_dims       =*/ n_dims,
         /*.ne           =*/ { 1, 1, 1, 1 },
diff --git a/llama.cpp b/llama.cpp
index 4529433b2..c858d08b3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -113,20 +113,6 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF() {
     return k_sizes;
 }
 
-// this is mostly needed for temporary mul_mat buffers to dequantize the data
-// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL() {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
-        //{ MODEL_7B,  768ull * MB }, // FIXME: increased until improved memory management
-        { MODEL_7B,  2048ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
-    };
-    return k_sizes;
-}
-
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -1099,8 +1085,7 @@ static void llama_model_load_internal(
             ctx_sum += it.second;
         }
 
-        const size_t mem_required =
-            ctx_sum + MEM_REQ_EVAL().at(model.type);
+        const size_t mem_required = ctx_sum;
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1191,7 +1176,8 @@ static ggml_graph_splits llama_build_graph(
     struct ggml_context * ctx_i = nullptr;
     struct ggml_context * ctx_o = nullptr;
     struct ggml_context * ctx_kv = nullptr;
-    // TODO: reuse vectors to avoid allocations
+
+    // TODO: reuse these vectors to avoid allocations during eval
     std::vector<ggml_context *> ctx_ls(n_layer);
     std::vector<ggml_context *> ctxs;
 
@@ -1212,10 +1198,17 @@ static ggml_graph_splits llama_build_graph(
         }
     }
 
+    bool measuring = lctx.bufs_compute[0]->backend_buffer->measure;
+
     struct ggml_tensor * inpL;
 
     // reuse the scale tensor for all layers since it requires a memory transfer
-    struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
+    //struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
+    // TODO: this shouldn't be necessary
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx_kv, GGML_TYPE_F32, 1);
+    if (!measuring) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
     ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
 
     if (embeddings_input) {
@@ -1459,6 +1452,8 @@ static ggml_graph_splits llama_build_graph(
     }
 
     ggml_graph_splits_build_forward(&splits, cur);
+    // TODO: this probably should be automatic on ggml_graph_splits_build_forward (and ggml_build_forward)
+    ggml_graph_splits_allocate_tensors(&splits);
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
@@ -2621,17 +2616,6 @@ struct llama_context * llama_new_context_with_model(
         ctx->embedding.resize(hparams.n_embd);
     }
 
-    // initialize compute buffers
-    // TODO: size the buffers more accurately - depends on improved memory management
-    // TODO: skip if no cpu layers
-    for (auto & backend_data : model->backends) {
-        ggml_buffer * buf_compute = ggml_buffer_alloc(backend_data.backend, MEM_REQ_EVAL().at(ctx->model.type), 2048);
-        ctx->bufs_compute.push_back(buf_compute);
-    }
-    // TODO: pinned memory for faster host-device transfers
-    //ggml_cuda_host_register(*(void**)ctx->buf_compute_cpu.backend_buffer, MEM_REQ_EVAL().at(ctx->model.type) + 128*2048);
-
-
     // initialize the graph input/output buffers
     // input buffer
     {
@@ -2679,6 +2663,36 @@ struct llama_context * llama_new_context_with_model(
         ggml_free(ctx0);
     }
 
+    // initialize compute buffers
+    // calculate the required memory size
+
+    // create dummy compute buffers - not great, but we need backend-specific buffers to account for their requirements (e.g. alignment)
+    for (auto & backend_data : model->backends) {
+        ggml_buffer * buf_compute = ggml_buffer_measure_alloc(backend_data.backend, 2048);
+        ctx->bufs_compute.push_back(buf_compute);
+    }
+    // build worst-case graph
+    int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+    int n_past = hparams.n_ctx - n_tokens;
+    /*ggml_graph_splits splits =*/ llama_build_graph(*ctx, n_tokens, n_past);
+
+    fprintf(stderr, "%s: compute ctx sizes:\n", __func__);
+    for (size_t i = 0; i < ctx->bufs_compute.size(); ++i) {
+        ggml_buffer * buf = ctx->bufs_compute[i];
+        ggml_backend * backend = buf->backend_buffer->backend;
+        size_t size = buf->backend_buffer->max_size;
+        fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(backend), size / 1024.0 / 1024.0);
+        ggml_buffer_free(buf);
+
+        // reallocate with the correct size, using the backend saved above (buf was just freed)
+        buf = ggml_buffer_alloc(backend, size, 2048);
+        ctx->bufs_compute[i] = buf;
+    }
+
+    // TODO: use pinned memory for faster host-device transfers
+    //ggml_cuda_host_register(*(void**)ctx->buf_compute_cpu.backend_buffer, MEM_REQ_EVAL().at(ctx->model.type) + 128*2048);
+
+
     // resized during inference
     if (params.logits_all) {
         ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
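
The measure-then-reallocate flow introduced in the llama.cpp hunks above can be condensed into a single helper. The sketch below is illustrative only and uses just the APIs added by this patch (ggml_buffer_measure_alloc, max_size, ggml_buffer_free, ggml_buffer_alloc, ggml_backend_name); the helper name, the caller-supplied build_worst_case_graph callback (a stand-in for llama_build_graph), and the max_tensors value of 2048 are hypothetical placeholders, not part of the patch.

// illustrative sketch - assumes ggml-backend.h from this patch is on the include path
#include "ggml-backend.h"
#include <cstdio>

static ggml_buffer * size_compute_buffer(ggml_backend * backend,
                                         void (*build_worst_case_graph)(ggml_buffer * measure_buf)) {
    // 1. measure pass: tensor data stays NULL, only the peak offset is tracked in max_size
    ggml_buffer * measure = ggml_buffer_measure_alloc(backend, /*max_tensors =*/ 2048);

    // 2. build the worst-case graph against the measure buffer; allocating its tensors
    //    (e.g. via ggml_graph_splits_allocate_tensors) is what records max_size
    build_worst_case_graph(measure);

    // 3. read back the measured size and drop the dummy buffer
    size_t size = measure->backend_buffer->max_size;
    fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(backend), size / 1024.0 / 1024.0);
    ggml_buffer_free(measure);

    // 4. allocate the real compute buffer with the measured size
    return ggml_buffer_alloc(backend, size, /*max_tensors =*/ 2048);
}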