diff --git a/ggml-backend.c b/ggml-backend.c
index 01d76af5c..9b596ac41 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -15,16 +15,20 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
     return offset + align;
 }
 
-static inline size_t ggml_backend_buffer_get_alloc_size(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { return alloc->interface.get_alloc_size(alloc, tensor); }
-static inline void ggml_backend_buffer_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.init_tensor(alloc, tensor); }
+static inline size_t ggml_backend_buffer_get_alloc_size(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
+    return alloc->interface.get_alloc_size(alloc, tensor);
+}
+static inline void ggml_backend_buffer_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
+    alloc->interface.init_tensor(alloc, tensor);
+}
 
 void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc) {
     alloc->interface.free_buffer(alloc);
     free(alloc);
 }
 
-// backend buffer allocator - simple
+// backend buffer allocator - simple - cannot free tensors, good for weights and small contexts
 
 struct ggml_allocator_simple_context {
     void * data;
@@ -38,21 +42,32 @@ static void ggml_allocator_simple_free_buffer(struct ggml_backend_buffer * alloc
     free(context);
 }
 
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
     struct ggml_allocator_simple_context * context = (struct ggml_allocator_simple_context *)alloc->context;
+
     size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
-    if (context->offset + size > context->size) {
+
+    if (!alloc->measure && context->offset + size > context->size) {
         fprintf(stderr, "%s: not enough space in the buffer (needed %zu, available %zu)\n",
                 __func__, size, context->size - context->offset);
         GGML_ASSERT(!"not enough space in the buffer");
         return;
     }
-    void * ptr = (char*)context->data + context->offset;
-    context->offset = aligned_offset(context->data, context->offset + size, context->alignment);
-    tensor->data = ptr;
-    if (alloc->interface.init_tensor) {
-        alloc->interface.init_tensor(alloc, tensor);
+
+    alloc->max_size = MAX(alloc->max_size, context->offset + size);
+
+    if (alloc->measure) {
+        tensor->data = NULL;
+    } else {
+        tensor->data = (char*)context->data + context->offset;
+        if (alloc->interface.init_tensor) {
+            ggml_backend_buffer_init_tensor(alloc, tensor);
+        }
     }
+
+    context->offset = aligned_offset(context->data, context->offset + size, context->alignment);
 }
 
 static void ggml_allocator_simple_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) {
@@ -83,7 +98,7 @@ static const struct ggml_backend_buffer_interface ggml_allocator_simple_interfac
     /* .free_data = */ NULL,
 };
 
-struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment) {
+static struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment) {
     struct ggml_allocator_simple_context * ctx = malloc(sizeof(struct ggml_allocator_simple_context));
     ctx->data = data;
     ctx->size = size;
@@ -94,20 +109,35 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size
     *allocator = (struct ggml_backend_buffer){
         /* .interface    = */ ggml_allocator_simple_interface,
         /* .context      = */ ctx,
+        /* .backend      = */ NULL,
         /* .backend_data = */ NULL,
+        /* .measure      = */ false,
+        /* .max_size     = */ 0,
     };
     return allocator;
 }
 
+//
+
+struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment) {
+    return ggml_allocator_simple_init(data, size, alignment);
+}
+
 // buffer
 
 struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors) {
     struct ggml_buffer * buffer = malloc(sizeof(struct ggml_buffer));
 
     buffer->mem_size = ggml_tensor_overhead() * max_tensors;
     buffer->mem_buffer = malloc(buffer->mem_size);
-    buffer->backend = backend;
     size += 128 * max_tensors; // alignment overhead
     buffer->backend_buffer = backend->interface.alloc_buffer(backend, size);
+    buffer->backend_buffer->backend = backend;
+    return buffer;
+}
+
+struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors) {
+    struct ggml_buffer * buffer = ggml_buffer_alloc(backend, 0, max_tensors);
+    buffer->backend_buffer->measure = true;
     return buffer;
 }
 
@@ -190,7 +220,7 @@ static void ggml_backend_cpu_free_buffer(struct ggml_backend_buffer * alloc) {
 static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_backend * backend, size_t size) {
     void * data = malloc(size);
 
-    struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
+    struct ggml_backend_buffer * buffer = ggml_allocator_default_init(data, size, TENSOR_ALIGNMENT);
     buffer->interface.free_data = ggml_backend_cpu_free_buffer;
     buffer->backend_data = data;
 
@@ -674,3 +704,33 @@ void allocate_graph(struct ggml_cgraph * gf, struct ggml_buffer * buffer) {
 }
 
 #endif
+
+void ggml_graph_allocate_tensors(struct ggml_cgraph * graph) {
+    ggml_graph_allocate_tensors_n(&graph, 1);
+}
+
+void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs) {
+}
+
+
+void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) {
+    bool visited[GGML_MAX_SPLITS] = {false};
+    for (int i = 0; i < splits->n_splits; i++) {
+        if (!visited[i]) {
+            struct ggml_graph_split * split = &splits->splits[i];
+            struct ggml_backend * backend = split->dst_inputs[0]->backend; // not great
+            struct ggml_cgraph * backend_graphs[GGML_MAX_SPLITS];
+            int num_graphs = 0;
+            for (int j = i; j < splits->n_splits; j++) {
+                if (splits->splits[j].dst_inputs[0]->backend == backend) {
+                    backend_graphs[num_graphs++] = splits->splits[j].graph;
+                    visited[j] = true;
+                    // TODO: need to ensure that the output tensors are never freed
+                    // maybe this can be done automatically in ggml_graph_calc_compute_buffer_size by assuming that n_childs == 0 => output tensor
+                }
+            }
+            ggml_graph_allocate_tensors_n(backend_graphs, num_graphs);
+        }
+    }
+}
+
diff --git a/ggml-backend.h b/ggml-backend.h
index d3b77a4aa..9cbdaca6c 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -7,8 +7,7 @@ extern "C" {
 #endif
 
     struct ggml_backend;
-
-    // backend buffers
+    // backend buffer
     typedef void * ggml_buffer_context_t;
 
     struct ggml_backend_buffer;
@@ -27,7 +26,10 @@ extern "C" {
     struct ggml_backend_buffer {
         struct ggml_backend_buffer_interface interface;
         ggml_buffer_context_t context;
+        struct ggml_backend * backend;
         void * backend_data;
+        bool measure;
+        size_t max_size;
     };
 
     // backend buffer helper functions
@@ -36,11 +38,8 @@ extern "C" {
     static inline void ggml_backend_buffer_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
     static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
 
-    // default buffer allocators
-    // simple buffer allocator: cannot free tensors, good for weights and small contexts
-    // default buffer allocator: can free tensors, good for compute contexts
-    GGML_API struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment);
-    GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment, int max_free_blocks);
+    // default buffer allocator
+    GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment);
 
     // buffer
 
@@ -51,11 +50,12 @@ extern "C" {
         void * mem_buffer;
 
         // tensor data
-        struct ggml_backend * backend;
         struct ggml_backend_buffer * backend_buffer;
     };
 
-    GGML_API struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors);
+    GGML_API struct ggml_buffer * ggml_buffer_alloc        (struct ggml_backend * backend, size_t size, size_t max_tensors);
+    GGML_API struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors);
+    // measure buffers only calculate the maximum size of the buffer without allocating it - useful for pre-allocation
     GGML_API void ggml_buffer_free(struct ggml_buffer * buffer);
 
     // backend
@@ -152,6 +152,11 @@ extern "C" {
     // compute
     GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits);
 
+    // graph tensor allocator
+    GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph);
+    GGML_API void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs);
+    GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 2ca183e12..ccb1326b3 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1726,7 +1726,7 @@ static ggml_backend_buffer * ggml_backend_cuda_alloc_buffer(ggml_backend * backe
     void * data;
     CUDA_CHECK(cudaMalloc(&data, size));
 
-    ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
+    ggml_backend_buffer * buffer = ggml_allocator_default_init(data, size, TENSOR_ALIGNMENT);
     buffer->interface.free_data = ggml_backend_cuda_free_buffer;
     buffer->backend_data = data;
 
diff --git a/ggml.c b/ggml.c
index 19db8241f..ddf784f4c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4468,7 +4468,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
 }
 
 struct ggml_backend * ggml_get_ctx_backend(struct ggml_context * ctx) {
-    return ctx->buffer->backend;
+    return ctx->buffer->backend_buffer->backend;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -4520,7 +4520,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     ggml_assert_aligned(result);
 
     *result = (struct ggml_tensor) {
-        /*.backend      =*/ ctx->buffer->backend,
+        /*.backend      =*/ ggml_get_ctx_backend(ctx),
         /*.type         =*/ type,
         /*.n_dims       =*/ n_dims,
         /*.ne           =*/ { 1, 1, 1, 1 },
diff --git a/llama.cpp b/llama.cpp
index 4529433b2..c858d08b3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -113,20 +113,6 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF() {
     return k_sizes;
 }
 
-// this is mostly needed for temporary mul_mat buffers to dequantize the data
-// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL() {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
-        //{ MODEL_7B,  768ull * MB }, // FIXME: increased until improved memory management
-        { MODEL_7B,  2048ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
-    };
-    return k_sizes;
-}
-
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -1099,8 +1085,7 @@ static void llama_model_load_internal(
             ctx_sum += it.second;
         }
 
-        const size_t mem_required =
-            ctx_sum + MEM_REQ_EVAL().at(model.type);
+        const size_t mem_required = ctx_sum;
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1191,7 +1176,8 @@ static ggml_graph_splits llama_build_graph(
     struct ggml_context * ctx_i = nullptr;
     struct ggml_context * ctx_o = nullptr;
     struct ggml_context * ctx_kv = nullptr;
-    // TODO: reuse vectors to avoid allocations
+
+    // TODO: reuse these vectors to avoid allocations during eval
     std::vector<ggml_context *> ctx_ls(n_layer);
     std::vector<ggml_context *> ctxs;
 
@@ -1212,10 +1198,17 @@ static ggml_graph_splits llama_build_graph(
         }
     }
 
+    bool measuring = lctx.bufs_compute[0]->backend_buffer->measure;
+
     struct ggml_tensor * inpL;
 
     // reuse the scale tensor for all layers since it requires a memory transfer
-    struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
+    //struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
+    // TODO: this shouldn't be necessary
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx_kv, GGML_TYPE_F32, 1);
+    if (!measuring) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
     ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
 
     if (embeddings_input) {
@@ -1459,6 +1452,8 @@ static ggml_graph_splits llama_build_graph(
     }
 
     ggml_graph_splits_build_forward(&splits, cur);
+    // TODO: this probably should be automatic on ggml_graph_splits_build_forward (and ggml_build_forward)
+    ggml_graph_splits_allocate_tensors(&splits);
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
@@ -2621,17 +2616,6 @@ struct llama_context * llama_new_context_with_model(
         ctx->embedding.resize(hparams.n_embd);
     }
 
-    // initialize compute buffers
-    // TODO: size the buffers more accurately - depends on improved memory management
-    // TODO: skip if no cpu layers
-    for (auto & backend_data : model->backends) {
-        ggml_buffer * buf_compute = ggml_buffer_alloc(backend_data.backend, MEM_REQ_EVAL().at(ctx->model.type), 2048);
-        ctx->bufs_compute.push_back(buf_compute);
-    }
-    // TODO: pinned memory for faster host-device transfers
-    //ggml_cuda_host_register(*(void**)ctx->buf_compute_cpu.backend_buffer, MEM_REQ_EVAL().at(ctx->model.type) + 128*2048);
-
-
     // initialize the graph input/output buffers
     // input buffer
     {
@@ -2679,6 +2663,36 @@ struct llama_context * llama_new_context_with_model(
         ggml_free(ctx0);
     }
 
+    // initialize compute buffers
+    // calculate the required memory size
+
+    // create dummy compute buffers - not great, but we need backend-specific buffers to account for their requirements (e.g. alignment)
+    for (auto & backend_data : model->backends) {
+        ggml_buffer * buf_compute = ggml_buffer_measure_alloc(backend_data.backend, 2048);
+        ctx->bufs_compute.push_back(buf_compute);
+    }
+    // build worst-case graph
+    int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+    int n_past = hparams.n_ctx - n_tokens;
+    /*ggml_graph_splits splits =*/ llama_build_graph(*ctx, n_tokens, n_past);
+
+    fprintf(stderr, "%s: compute ctx sizes:\n", __func__);
+    for (size_t i = 0; i < ctx->bufs_compute.size(); ++i) {
+        ggml_buffer * buf = ctx->bufs_compute[i];
+        ggml_backend * backend = buf->backend_buffer->backend;
+        size_t size = buf->backend_buffer->max_size;
+        fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(backend), size / 1024.0 / 1024.0);
+        ggml_buffer_free(buf);
+
+        // reallocate with the correct size, using the backend saved above (buf was just freed)
+        buf = ggml_buffer_alloc(backend, size, 2048);
+        ctx->bufs_compute[i] = buf;
+    }
+
+    // TODO: use pinned memory for faster host-device transfers
+    //ggml_cuda_host_register(*(void**)ctx->buf_compute_cpu.backend_buffer, MEM_REQ_EVAL().at(ctx->model.type) + 128*2048);
+
+
     // resized during inference
     if (params.logits_all) {
         ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
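
The measure-then-reallocate flow introduced in the llama.cpp hunks above can be condensed into a single helper. The sketch below is illustrative only and uses just the APIs added by this patch (ggml_buffer_measure_alloc, max_size, ggml_buffer_free, ggml_buffer_alloc, ggml_backend_name); the helper name, the caller-supplied build_worst_case_graph callback (a stand-in for llama_build_graph), and the max_tensors value of 2048 are hypothetical placeholders, not part of the patch.

// illustrative sketch - assumes ggml-backend.h from this patch is on the include path
#include "ggml-backend.h"
#include <cstdio>

static ggml_buffer * size_compute_buffer(ggml_backend * backend,
                                         void (*build_worst_case_graph)(ggml_buffer * measure_buf)) {
    // 1. measure pass: tensor data stays NULL, only the peak offset is tracked in max_size
    ggml_buffer * measure = ggml_buffer_measure_alloc(backend, /*max_tensors =*/ 2048);

    // 2. build the worst-case graph against the measure buffer; allocating its tensors
    //    (e.g. via ggml_graph_splits_allocate_tensors) is what records max_size
    build_worst_case_graph(measure);

    // 3. read back the measured size and drop the dummy buffer
    size_t size = measure->backend_buffer->max_size;
    fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(backend), size / 1024.0 / 1024.0);
    ggml_buffer_free(measure);

    // 4. allocate the real compute buffer with the measured size
    return ggml_buffer_alloc(backend, size, /*max_tensors =*/ 2048);
}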