From 295f85654ace12a1dbc9a59a06fb8c467645b661 Mon Sep 17 00:00:00 2001
From: slaren
Date: Mon, 17 Jul 2023 19:03:51 +0200
Subject: [PATCH] allocators wip

renamed ggml_backend functions
changed ggml_buffer and ggml_backend to always be used as pointers
rename ggml_tensor::params -> op_params
---
 examples/common.cpp |  18 +-
 ggml-backend.c      | 473 +++++++++++++++++++++++++++++++++-----------
 ggml-backend.h      | 118 ++++++-----
 ggml-cuda.cu        | 118 +++++------
 ggml-cuda.h         |   8 +-
 ggml.c              |  78 ++++----
 ggml.h              |   9 +-
 llama.cpp           | 183 +++++++++--------
 8 files changed, 640 insertions(+), 365 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 8705127cb..2846837a7 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -327,24 +327,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.n_gpu_layers = std::stoi(argv[i]);
 #else
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU support\n");
 #endif
         } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
             params.main_gpu = std::stoi(argv[i]);
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a main GPU.\n");
 #endif
         } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -361,14 +361,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 }
             }
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
-#endif // GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUDA
         } else if (arg == "--low-vram" || arg == "-lv") {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
             params.low_vram = true;
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
-#endif // GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. 
It is not possible to set lower vram usage.\n"); +#endif // GGML_USE_CUDA } else if (arg == "--no-mmap") { params.use_mmap = false; } else if (arg == "--mtest") { diff --git a/ggml-backend.c b/ggml-backend.c index 85a6cac05..01d76af5c 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -7,22 +7,114 @@ #define UNUSED(x) (void)(x) -// backend buffer +// allocator -struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors) { - struct ggml_buffer buffer; - buffer.mem_size = ggml_tensor_overhead() * max_tensors; - buffer.mem_buffer = malloc(buffer.mem_size); - buffer.backend = backend; +static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { + assert(alignment && !(alignment & (alignment - 1))); // power of 2 + size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; + return offset + align; +} + +static inline size_t ggml_backend_buffer_get_alloc_size(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { return alloc->interface.get_alloc_size(alloc, tensor); } +static inline void ggml_backend_buffer_init_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.init_tensor(alloc, tensor); } + + +void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc) { + alloc->interface.free_buffer(alloc); + free(alloc); +} + +// backend buffer allocator - simple + +struct ggml_allocator_simple_context { + void * data; + size_t size; + size_t offset; + size_t alignment; +}; + +static void ggml_allocator_simple_free_buffer(struct ggml_backend_buffer * alloc) { + struct ggml_allocator_simple_context * context = (struct ggml_allocator_simple_context *)alloc->context; + free(context); +} + +static void ggml_allocator_simple_alloc_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { + struct ggml_allocator_simple_context * context = (struct ggml_allocator_simple_context *)alloc->context; + size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor); + if (context->offset + size > context->size) { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, available %zu)\n", + __func__, size, context->size - context->offset); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + void * ptr = (char*)context->data + context->offset; + context->offset = aligned_offset(context->data, context->offset + size, context->alignment); + tensor->data = ptr; + if (alloc->interface.init_tensor) { + alloc->interface.init_tensor(alloc, tensor); + } +} + +static void ggml_allocator_simple_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { + GGML_ASSERT(!"ggml_simple_allocator cannot free individual tensors"); + + UNUSED(alloc); + UNUSED(tensor); +} + +static void ggml_allocator_simple_reset(struct ggml_backend_buffer * alloc) { + struct ggml_allocator_simple_context * context = (struct ggml_allocator_simple_context *)alloc->context; + context->offset = aligned_offset(context->data, 0, context->alignment); +} + +size_t ggml_allocator_simple_get_alloc_size(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { + return ggml_nbytes(tensor); + + UNUSED(alloc); +} + +static const struct ggml_backend_buffer_interface ggml_allocator_simple_interface = { + /* .free_buffer = */ ggml_allocator_simple_free_buffer, + /* .alloc_tensor = */ ggml_allocator_simple_alloc_tensor, + /* .free_tensor = */ ggml_allocator_simple_free_tensor, + /* .reset = */ ggml_allocator_simple_reset, + /* .get_alloc_size = */ 
ggml_allocator_simple_get_alloc_size, + /* .init_tensor = */ NULL, + /* .free_data = */ NULL, +}; + +struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment) { + struct ggml_allocator_simple_context * ctx = malloc(sizeof(struct ggml_allocator_simple_context)); + ctx->data = data; + ctx->size = size; + ctx->offset = aligned_offset(data, 0, alignment); + ctx->alignment = alignment; + + struct ggml_backend_buffer * allocator = malloc(sizeof(struct ggml_backend_buffer)); + *allocator = (struct ggml_backend_buffer){ + /* .interface = */ ggml_allocator_simple_interface, + /* .context = */ ctx, + /* .backend_data = */ NULL, + }; + return allocator; +} + +// buffer + +struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors) { + struct ggml_buffer * buffer = malloc(sizeof(struct ggml_buffer)); + buffer->mem_size = ggml_tensor_overhead() * max_tensors; + buffer->mem_buffer = malloc(buffer->mem_size); + buffer->backend = backend; size += 128 * max_tensors; // alignment overhead - buffer.backend_buffer = backend->interface->alloc_buffer(backend->context, size); + buffer->backend_buffer = backend->interface.alloc_buffer(backend, size); return buffer; } -void ggml_backend_free_buffer(struct ggml_buffer * buffer) { - struct ggml_backend * backend = buffer->backend; - backend->interface->free_buffer(backend->context, buffer->backend_buffer); +void ggml_buffer_free(struct ggml_buffer * buffer) { + ggml_backend_buffer_free(buffer->backend_buffer); free(buffer->mem_buffer); + free(buffer); } // backend copy @@ -42,7 +134,7 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml return true; } -void ggml_backend_cpy_tensor(struct ggml_tensor * dst, struct ggml_tensor * src) { +void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]); //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); @@ -53,17 +145,17 @@ void ggml_backend_cpy_tensor(struct ggml_tensor * dst, struct ggml_tensor * src) return; } - if (dst->backend->interface->cpy_tensor_from != NULL) { - dst->backend->interface->cpy_tensor_from(dst->backend->context, src, dst); - } else if (src->backend->interface->cpy_tensor_to != NULL) { - src->backend->interface->cpy_tensor_to(src->backend->context, src, dst); + if (dst->backend->interface.cpy_tensor_from != NULL) { + dst->backend->interface.cpy_tensor_from(dst->backend->context, src, dst); + } else if (src->backend->interface.cpy_tensor_to != NULL) { + src->backend->interface.cpy_tensor_to(src->backend->context, src, dst); } else { // not ideal, but shouldn't be hit when copying from/to CPU // TODO: print a performance warning in debug builds size_t nbytes = ggml_nbytes(src); void * data = malloc(nbytes); - ggml_backend_get_tensor(src, data, 0, nbytes); - ggml_backend_set_tensor(dst, data, 0, nbytes); + ggml_backend_tensor_get(src, data, 0, nbytes); + ggml_backend_tensor_set(dst, data, 0, nbytes); free(data); } } @@ -76,105 +168,70 @@ struct ggml_backend_cpu_context { size_t work_size; }; -static const char * 
ggml_backend_cpu_name(ggml_backend_context_t ctx) { +static const char * ggml_backend_cpu_name(struct ggml_backend * backend) { return "CPU"; - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cpu_free_context(ggml_backend_context_t ctx) { - struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)ctx; +static void ggml_backend_cpu_free(struct ggml_backend * backend) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; free(cpu_ctx->work_data); - free(ctx); + free(cpu_ctx); + free(backend); } -struct cpu_backend_buffer { - void * data; - size_t offset; - size_t size; -}; - static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 -static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { - assert(alignment && !(alignment & (alignment - 1))); // power of 2 - size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; - return offset + align; +static void ggml_backend_cpu_free_buffer(struct ggml_backend_buffer * alloc) { + free(alloc->backend_data); } -static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_context_t ctx, size_t size) { - struct cpu_backend_buffer * buffer = malloc(sizeof(struct cpu_backend_buffer)); - buffer->data = malloc(size); - buffer->offset = aligned_offset(buffer->data, 0, TENSOR_ALIGNMENT); - buffer->size = size; +static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_backend * backend, size_t size) { + void * data = malloc(size); + + struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT); + buffer->interface.free_data = ggml_backend_cpu_free_buffer; + buffer->backend_data = data; + return buffer; - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cpu_free_buffer(ggml_backend_context_t ctx, ggml_backend_buffer_t buffer) { - struct cpu_backend_buffer * cpu_buffer = (struct cpu_backend_buffer *)buffer; - free(cpu_buffer->data); - free(cpu_buffer); - - UNUSED(ctx); -} - -static void ggml_backend_cpu_reset_buffer(ggml_backend_context_t ctx, ggml_backend_buffer_t buffer) { - struct cpu_backend_buffer * cpu_buffer = (struct cpu_backend_buffer *)buffer; - cpu_buffer->offset = aligned_offset(cpu_buffer->data, 0, TENSOR_ALIGNMENT); - - UNUSED(ctx); -} - -static void ggml_backend_cpu_alloc_tensor(ggml_backend_context_t ctx, ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - struct cpu_backend_buffer * cpu_buffer = (struct cpu_backend_buffer *)buffer; - - // TODO: make this error recoverable - if (cpu_buffer->offset + ggml_nbytes(tensor) > cpu_buffer->size) { - fprintf(stderr, "%s: not enough space in the buffer (needed %zu, available %zu)\n", - __func__, ggml_nbytes(tensor), cpu_buffer->size - cpu_buffer->offset); - GGML_ASSERT(false); - } - - tensor->data = (char*)cpu_buffer->data + cpu_buffer->offset; - cpu_buffer->offset = aligned_offset(cpu_buffer->data, cpu_buffer->offset + ggml_nbytes(tensor), TENSOR_ALIGNMENT); - - UNUSED(ctx); -} - -static void ggml_backend_cpu_set_tensor_async(ggml_backend_context_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cpu_set_tensor_async(struct ggml_backend * backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); memcpy((char *)tensor->data + offset, data, 
size); - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cpu_get_tensor_async(ggml_backend_context_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cpu_get_tensor_async(struct ggml_backend * backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); memcpy(data, (const char *)tensor->data + offset, size); - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cpu_synchronize(ggml_backend_context_t ctx) { - UNUSED(ctx); +static void ggml_backend_cpu_synchronize(struct ggml_backend * backend) { + UNUSED(backend); } -static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_context_t ctx, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_get_tensor(src, dst->data, 0, ggml_nbytes(src)); +static void ggml_backend_cpu_cpy_tensor_from(struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_context_t ctx, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_set_tensor_async(dst, src->data, 0, ggml_nbytes(src)); +static void ggml_backend_cpu_cpy_tensor_to(struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); - UNUSED(ctx); + UNUSED(backend); } struct ggml_backend_cpu_plan { @@ -182,8 +239,8 @@ struct ggml_backend_cpu_plan { struct ggml_cgraph cgraph; }; -static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) { - struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)ctx; +static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_cpu_plan * cpu_plan = malloc(sizeof(struct ggml_backend_cpu_plan)); @@ -197,25 +254,25 @@ static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_context return cpu_plan; } -static void ggml_backend_cpu_graph_plan_free(ggml_backend_context_t ctx, ggml_graph_plan_t plan) { +static void ggml_backend_cpu_graph_plan_free(struct ggml_backend * backend, ggml_graph_plan_t plan) { struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; free(cpu_plan->cplan.work_data); free(cpu_plan); - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cpu_graph_plan_compute(ggml_backend_context_t ctx, ggml_graph_plan_t plan) { +static void ggml_backend_cpu_graph_plan_compute(struct ggml_backend * backend, ggml_graph_plan_t plan) { struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cpu_graph_compute(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) { - struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)ctx; +static void ggml_backend_cpu_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * 
cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); @@ -232,11 +289,8 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_context_t ctx, struct gg static struct ggml_backend_interface cpu_backend_interface = { /* .get_name = */ ggml_backend_cpu_name, - /* .free_context = */ ggml_backend_cpu_free_context, + /* .free = */ ggml_backend_cpu_free, /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, - /* .free_buffer = */ ggml_backend_cpu_free_buffer, - /* .reset_buffer = */ ggml_backend_cpu_reset_buffer, - /* .alloc_tensor = */ ggml_backend_cpu_alloc_tensor, /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, /* .synchronize = */ ggml_backend_cpu_synchronize, @@ -248,14 +302,16 @@ static struct ggml_backend_interface cpu_backend_interface = { /* .graph_compute = */ ggml_backend_cpu_graph_compute }; -struct ggml_backend ggml_backend_cpu_init(void) { +struct ggml_backend * ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->work_data = NULL; ctx->work_size = 0; - struct ggml_backend cpu_backend = { - /* .interface = */ &cpu_backend_interface, + struct ggml_backend * cpu_backend = malloc(sizeof(struct ggml_backend)); + + *cpu_backend = (struct ggml_backend) { + /* .interface = */ cpu_backend_interface, /* .context = */ ctx }; return cpu_backend; @@ -287,26 +343,31 @@ void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_t struct ggml_graph_split * split = &splits->splits[splits->n_splits]; + // check if the split is on the same backend as the previous one + // FIXME: need to check all the inputs if ((*inputs[0])->backend == ggml_get_ctx_backend(ctx)) { - if (splits->n_splits > 0) { - char name[GGML_MAX_NAME]; - vsnprintf(name, sizeof(name), fmt, args); + if (splits->n_splits == 0) { + // always add the first split + int i = 0; + while (inputs[i] != NULL) { + GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); + split->src_inputs[i] = *inputs[i]; + split->dst_inputs[i] = *inputs[i]; + i++; + } + split->src_inputs[i] = NULL; + split->dst_inputs[i] = NULL; + } else { + // add to the previous split + char name[GGML_MAX_NAME - 2]; + int n = vsnprintf(name, sizeof(name), fmt, args); char new_name[GGML_MAX_NAME]; - snprintf(new_name, sizeof(new_name), "%s,%s", splits->splits[splits->n_splits - 1].name, name); + snprintf(new_name, sizeof(new_name), "%.*s,%s", GGML_MAX_NAME - n - 2, splits->splits[splits->n_splits - 1].name, name); strcpy(splits->splits[splits->n_splits - 1].name, new_name); return; } - // always add the first split - int i = 0; - while (inputs[i] != NULL) { - GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); - split->src_inputs[i] = *inputs[i]; - split->dst_inputs[i] = *inputs[i]; - i++; - } - split->src_inputs[i] = NULL; - split->dst_inputs[i] = NULL; } else { + // add a new split int i = 0; while (inputs[i] != NULL) { GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); @@ -360,8 +421,6 @@ void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct g // TODO: allocate graphs in context split->graph = (struct ggml_cgraph *) malloc(sizeof(struct ggml_cgraph)); memset(split->graph, 0, sizeof(struct ggml_cgraph)); - // *split->graph = ggml_build_forward_range(output, split->input); - // *split->graph = ggml_build_forward(output); for (int j = 0; outputs[j] != NULL; 
j++) { ggml_build_forward_expand(split->graph, outputs[j]); } @@ -404,10 +463,8 @@ void ggml_graph_splits_compute(struct ggml_graph_splits * splits) { // copy the input tensor to the backend uint64_t copy_start_us = ggml_time_us(); for (int j = 0; split->src_inputs[j] != NULL; j++) { - if (split->src_inputs[j] != split->dst_inputs[j]) { - //printf("\tcopying tensor %d (%s) (%lu bytes)\n", j, split->src_inputs[j]->name, ggml_nbytes(split->src_inputs[j])); - ggml_backend_cpy_tensor(split->dst_inputs[j], split->src_inputs[j]); - } + //printf("\tcopying tensor %d (%s) (%lu bytes)\n", j, split->src_inputs[j]->name, ggml_nbytes(split->src_inputs[j])); + ggml_backend_tensor_copy(split->src_inputs[j], split->dst_inputs[j]); } // ggml_backend_synchronize(split->dst_inputs[0]->backend); copy_us += ggml_time_us() - copy_start_us; @@ -433,3 +490,187 @@ void ggml_graph_splits_compute(struct ggml_graph_splits * splits) { //printf("splits: %d, nodes: %d, copy: %.2fms, compute_cpu: %.2fms, compute_gpu: %.2fms\n", splits->n_splits, n_nodes, copy_us / 1000.0, compute_cpu_us / 1000.0, compute_gpu_us / 1000.0); //exit(0); } + +#if 0 +// default allocator +struct free_block { + void * addr; + size_t size; +}; + +struct ggml_backend_default_allocator_context { + void * data; + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[]; +}; + +void ggml_backend_default_allocator_free_context(ggml_allocator_context_t ctx) { + struct ggml_backend_default_allocator_context * allocator_ctx = ctx; + free(allocator_ctx); +} + +ggml_allocator_context_t ggml_backend_default_allocator_context(void * data, size_t size, size_t alignment, int n_free_blocks) { + struct ggml_backend_default_allocator_context * ctx = malloc(sizeof(struct ggml_backend_default_allocator_context) + n_free_blocks * sizeof(struct free_block)); + ctx->data = data; + ctx->alignment = alignment; + ctx->n_free_blocks = 1; + size_t align_offset = align_offset(data, alignment); + ctx->free_blocks[0].addr = (char *)data + align_offset; + ctx->free_blocks[0].size = size - align_offset; + return ctx; +} + +void * ggml_backend_default_allocator_alloc(ggml_allocator_context_t ctx, size_t size) { + struct ggml_backend_default_allocator_context * allocator_ctx = ctx; + size = align_size(size, allocator_ctx->alignment); + // find a free block + for (int i = 0; i < allocator_ctx->n_free_blocks; i++) { + struct free_block * block = &allocator_ctx->free_blocks[i]; + if (block->size >= size) { + void * addr = block->addr; + block->addr += size; + block->size -= size; + if (block->size == 0) { + // remove block if empty + allocator_ctx->n_free_blocks--; + for (int j = i; j < allocator_ctx->n_free_blocks; j++) { + allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1]; + } + } + return addr; + } + } + return NULL; +} + +// this is a very naive implementation, but for our case the number of free blocks should be very small +void ggml_backend_default_allocator_free(ggml_allocator_context_t ctx, void * ptr, size_t size) { + struct ggml_backend_default_allocator_context * allocator_ctx = ctx; + size = align_size(size, allocator_ctx->alignment); + // see if we can merge with an existing block + for (int i = 0; i < allocator_ctx->n_free_blocks; i++) { + struct free_block * block = &allocator_ctx->free_blocks[i]; + // check if ptr is at the end of the block + if (block->addr + block->size == ptr) { + block->size += size; + // check if we can merge with the next block + if (i < allocator_ctx->n_free_blocks - 1 && block->addr + block->size == 
allocator_ctx->free_blocks[i+1].addr) { + block->size += allocator_ctx->free_blocks[i+1].size; + allocator_ctx->n_free_blocks--; + for (int j = i+1; j < allocator_ctx->n_free_blocks; j++) { + allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1]; + } + } + return; + } + // check if ptr is at the beginning of the block + if (ptr + size == block->addr) { + block->addr = ptr; + block->size += size; + // check if we can merge with the previous block + if (i > 0 && allocator_ctx->free_blocks[i-1].addr + allocator_ctx->free_blocks[i-1].size == block->addr) { + allocator_ctx->free_blocks[i-1].size += block->size; + allocator_ctx->n_free_blocks--; + for (int j = i; j < allocator_ctx->n_free_blocks; j++) { + allocator_ctx->free_blocks[j] = allocator_ctx->free_blocks[j+1]; + } + } + return; + } + } + // otherwise, add a new block + if (allocator_ctx->n_free_blocks < MAX_FREE_BLOCKS) { + // insert the new block in the correct position to keep the array sorted + int insert_pos = 0; + while (insert_pos < allocator_ctx->n_free_blocks && allocator_ctx->free_blocks[insert_pos].addr < ptr) { + insert_pos++; + } + // shift all blocks from insert_pos onward to make room for the new block + for (int i = allocator_ctx->n_free_blocks; i > insert_pos; i--) { + allocator_ctx->free_blocks[i] = allocator_ctx->free_blocks[i-1]; + } + // insert the new block + allocator_ctx->free_blocks[insert_pos].addr = ptr; + allocator_ctx->free_blocks[insert_pos].size = size; + allocator_ctx->n_free_blocks++; + } + else { + GGML_ASSERT(!"out of free blocks"); + } +} + +static bool ggml_is_view(struct ggml_tensor * t) { + return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || + t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE; +} + + +NOTE: id can be n_leaf OR n_node instead, we can determine the type by checking if the node is a leaf or not + +void allocate_graph(struct ggml_cgraph * gf, struct ggml_buffer * buffer) { + int node_children_count[GGML_MAX_NODES*2]; + int node_view_count[GGML_MAX_NODES*2]; + memset(node_children_count, 0, sizeof(int) * (gf->n_nodes + gf->n_leafs)); + memset(node_view_count, 0, sizeof(int) * (gf->n_nodes + gf->n_leafs)); + + // count number of children and views + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + // todo: .... 
+ node_children_count[parent->id] += 1; + if (ggml_is_view(parent)) { + struct ggml_tensor * ancestor = parent; + do { + node_view_count[ancestor->id] += 1; + ancestor = ancestor->src[0]; + } while (ggml_is_view(ancestor)); + } + } + } + + // allocate tensors + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + bool is_view = ggml_is_view(node); + if (is_view) { + // allocate view accordingly to the OP + node->data = node->src[0]->data; // + offset + struct ggml_tensor * ancestor = node->src[0]; + while (ggml_is_view(ancestor)) { + ancestor = ancestor->src[0]; + } + node_view_count[ancestor->id] -= 1; + } else { + if (node->data == NULL) { + // allocate tensor + // TODO: if last children and size == parent.size, then reuse parent tensor (auto in-place) + // may need a list of ops that can be in-place + ggml_backend_alloc_tensor(buffer, node); + } + } + + // update parents + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + if (is_view) { + node_view_count[parent->id] -= 1; + } + node_children_count[parent->id] -= 1; + if (node_children_count[parent->id] == 0 && node_view_count[parent->id] == 0) { + // free parent + ggml_backend_free_tensor(buffer, parent); + } + } + } +} + +#endif diff --git a/ggml-backend.h b/ggml-backend.h index ce5aac2b5..d3b77a4aa 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -5,12 +5,45 @@ #ifdef __cplusplus extern "C" { #endif - - typedef void * ggml_graph_plan_t; - typedef void * ggml_backend_context_t; - typedef void * ggml_backend_buffer_t; struct ggml_backend; + + // backend buffers + typedef void * ggml_buffer_context_t; + struct ggml_backend_buffer; + + struct ggml_backend_buffer_interface { + // allocator functions + void (*free_buffer) (struct ggml_backend_buffer * alloc); + void (*alloc_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); + void (*free_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); + void (*reset) (struct ggml_backend_buffer * alloc); + // functions overriden by the backend + size_t (*get_alloc_size)(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); // pre-allocation callback + void (*init_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); // post-allocation callback + void (*free_data) (struct ggml_backend_buffer * alloc); // free backend-specific data // TODO: better name + }; + + struct ggml_backend_buffer { + struct ggml_backend_buffer_interface interface; + ggml_buffer_context_t context; + void * backend_data; + }; + + // backend buffer helper functions + GGML_API void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc); + static inline void ggml_backend_buffer_tensor_alloc(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.alloc_tensor(alloc, tensor); } + static inline void ggml_backend_buffer_free_tensor(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); } + static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); } + + // default buffer allocators + // simple buffer allocator: cannot free tensors, good for weights and small contexts + // default buffer allocator: can free tensors, good for compute contexts + GGML_API struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size, size_t alignment); + GGML_API struct ggml_backend_buffer * 
ggml_allocator_default_init(void * data, size_t size, size_t alignment, int max_free_blocks); + + // buffer + // buffers have space for the tensor structs in host memory, and tensor data in backend-specific memory struct ggml_buffer { // host memory @@ -19,75 +52,70 @@ extern "C" { // tensor data struct ggml_backend * backend; - ggml_backend_buffer_t backend_buffer; // backend-specific data + struct ggml_backend_buffer * backend_buffer; }; + GGML_API struct ggml_buffer * ggml_buffer_alloc(struct ggml_backend * backend, size_t size, size_t max_tensors); + GGML_API void ggml_buffer_free(struct ggml_buffer * buffer); + + // backend + typedef void * ggml_backend_context_t; + typedef void * ggml_graph_plan_t; + struct ggml_backend_interface { - const char * (*get_name)(ggml_backend_context_t ctx); + const char * (*get_name)(struct ggml_backend * backend); - void (*free_context)(ggml_backend_context_t ctx); + void (*free)(struct ggml_backend * backend); - // buffers - ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_context_t ctx, size_t size); - void (*free_buffer) (ggml_backend_context_t ctx, ggml_backend_buffer_t buffer); - void (*reset_buffer)(ggml_backend_context_t ctx, ggml_backend_buffer_t buffer); - void (*alloc_tensor)(ggml_backend_context_t ctx, ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - - // TODO: pinned buffers for faster transfers between host and device + // buffer allocation + struct ggml_backend_buffer * (*alloc_buffer)(struct ggml_backend * backend, size_t size); // tensor data access // these functions can be asynchronous. helper functions are provided for synchronous access that automatically call synchronize - void (*set_tensor_async)(ggml_backend_context_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor_async)(ggml_backend_context_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - void (*synchronize)(ggml_backend_context_t ctx); + void (*set_tensor_async)(struct ggml_backend * backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor_async)(struct ggml_backend * backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void (*synchronize) (struct ggml_backend * backend); // (optional) copy tensor between different backends, allow for single-copy tranfers - void (*cpy_tensor_from)(ggml_backend_context_t ctx, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to) (ggml_backend_context_t ctx, struct ggml_tensor * src, struct ggml_tensor * dst); - + void (*cpy_tensor_from)(struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst); // compute graph with a plan - ggml_graph_plan_t (*graph_plan_create) (ggml_backend_context_t ctx, struct ggml_cgraph * cgraph); - void (*graph_plan_free) (ggml_backend_context_t ctx, ggml_graph_plan_t plan); - void (*graph_plan_compute)(ggml_backend_context_t ctx, ggml_graph_plan_t plan); + ggml_graph_plan_t (*graph_plan_create) (struct ggml_backend * backend, struct ggml_cgraph * cgraph); + void (*graph_plan_free) (struct ggml_backend * backend, ggml_graph_plan_t plan); + void (*graph_plan_compute)(struct ggml_backend * backend, ggml_graph_plan_t plan); // compute graph without a plan - void (*graph_compute) (ggml_backend_context_t ctx, struct ggml_cgraph * cgraph); + void (*graph_compute) (struct ggml_backend * 
backend, struct ggml_cgraph * cgraph); // check if a backend supports a given operation // this could be used to fallback automatically to the CPU backend if a backend doesn't support an operation - // bool (*supports_op)(ggml_backend_context_t ctx, struct ggml_tensor * op); + // bool (*supports_op)(struct ggml_backend * backend, struct ggml_tensor * op); }; struct ggml_backend { - struct ggml_backend_interface * interface; + struct ggml_backend_interface interface; ggml_backend_context_t context; }; // backend helper functions - static inline const char * ggml_backend_name(struct ggml_backend * backend) { return backend->interface->get_name(backend->context); } - static inline void ggml_backend_free_context(struct ggml_backend * backend) { backend->interface->free_context(backend->context); } - static inline void ggml_backend_set_tensor_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface->set_tensor_async(tensor->backend->context, tensor, data, offset, size); } - static inline void ggml_backend_get_tensor_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface->get_tensor_async(tensor->backend->context, tensor, data, offset, size); } - static inline void ggml_backend_set_tensor(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface->set_tensor_async(tensor->backend->context, tensor, data, offset, size); tensor->backend->interface->synchronize(tensor->backend->context); } - static inline void ggml_backend_get_tensor(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface->get_tensor_async(tensor->backend->context, tensor, data, offset, size); tensor->backend->interface->synchronize(tensor->backend->context); } - static inline void ggml_backend_synchronize(struct ggml_backend * backend) { backend->interface->synchronize(backend->context); } - static inline ggml_graph_plan_t ggml_backend_graph_plan_create(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { return backend->interface->graph_plan_create(backend->context, cgraph); } - static inline void ggml_backend_graph_plan_free(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface->graph_plan_free(backend->context, plan); } - static inline void ggml_backend_graph_plan_compute(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface->graph_plan_compute(backend->context, plan); } - static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); } - - // buffer and tensor allocation - GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); - GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer); - static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); } - static inline void ggml_backend_alloc_tensor(struct ggml_buffer * buffer, struct ggml_tensor * tensor) { buffer->backend->interface->alloc_tensor(buffer->backend->context, buffer->backend_buffer, tensor); } + static inline const char * ggml_backend_name(struct ggml_backend * backend) { return backend->interface.get_name(backend); } + static inline void ggml_backend_free(struct ggml_backend * backend) { backend->interface.free(backend); } + 
static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface.set_tensor_async(tensor->backend, tensor, data, offset, size); } + static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface.get_tensor_async(tensor->backend, tensor, data, offset, size); } + static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface.set_tensor_async(tensor->backend, tensor, data, offset, size); tensor->backend->interface.synchronize(tensor->backend); } + static inline void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface.get_tensor_async(tensor->backend, tensor, data, offset, size); tensor->backend->interface.synchronize(tensor->backend); } + static inline void ggml_backend_synchronize(struct ggml_backend * backend) { backend->interface.synchronize(backend); } + static inline ggml_graph_plan_t ggml_backend_graph_plan_create(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { return backend->interface.graph_plan_create(backend, cgraph); } + static inline void ggml_backend_graph_plan_free(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_free(backend, plan); } + static inline void ggml_backend_graph_plan_compute(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_compute(backend, plan); } + static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface.graph_compute(backend, cgraph); } // tensor copy between different backends - GGML_API void ggml_backend_cpy_tensor(struct ggml_tensor * dst, struct ggml_tensor * src); + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); // CPU backend - GGML_API struct ggml_backend ggml_backend_cpu_init(void); + GGML_API struct ggml_backend * ggml_backend_cpu_init(void); GGML_API void ggml_backend_cpu_set_n_threads(struct ggml_backend * backend_cpu, int n_threads); /////////////////////////// diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 343eda0b2..2ca183e12 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -585,6 +585,14 @@ void ggml_cuda_host_free(void * ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } +void ggml_cuda_host_register(void * ptr, size_t size) { + CUDA_CHECK(cudaHostRegister(ptr, size, 0)); +} + +void ggml_cuda_host_unregister(void * ptr) { + CUDA_CHECK(cudaHostUnregister(ptr)); +} + template static void ggml_cuda_op_add( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, @@ -792,9 +800,9 @@ static void ggml_cuda_op_rope( const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; - const int n_past = ((int32_t *) dst->params)[0]; - const int n_dims = ((int32_t *) dst->params)[1]; - const int mode = ((int32_t *) dst->params)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; //const int n_ctx = ((int32_t *) dst->params)[3]; GGML_ASSERT(mode == 0); @@ -822,7 +830,7 @@ static void ggml_cuda_op_diag_mask_inf( const int64_t ne01 = src0->ne[1]; const int64_t i01_diff = i01_high - i01_low; - const int n_past = ((int32_t *) dst->params)[0]; + const int n_past = ((int32_t *) dst->op_params)[0]; // 
compute diag_mask_inf_cuda((src0_t *)src0_d, (dst_t *)dst_d, ne00, i01_diff, ne01, n_past, stream); @@ -1689,16 +1697,17 @@ struct ggml_backend_cuda_context { ggml_cuda_context * cuda_ctx = ggml_cuda_init(); }; -static const char * ggml_backend_cuda_name(ggml_backend_context_t ctx) { +static const char * ggml_backend_cuda_name(ggml_backend * backend) { return "CUDA"; - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cuda_free_context(ggml_backend_context_t ctx) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)ctx; +static void ggml_backend_cuda_free(ggml_backend * backend) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_free(cuda_ctx->cuda_ctx); delete cuda_ctx; + delete backend; } struct cuda_backend_buffer { @@ -1709,116 +1718,82 @@ struct cuda_backend_buffer { static const size_t TENSOR_ALIGNMENT = 128; -static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { - assert(alignment && !(alignment & (alignment - 1))); // power of 2 - size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; - return offset + align; +static void ggml_backend_cuda_free_buffer(struct ggml_backend_buffer * alloc) { + CUDA_CHECK(cudaFree(alloc->backend_data)); } -static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_context_t ctx, size_t size) { - cuda_backend_buffer * buffer = new cuda_backend_buffer; +static ggml_backend_buffer * ggml_backend_cuda_alloc_buffer(ggml_backend * backend, size_t size) { + void * data; + CUDA_CHECK(cudaMalloc(&data, size)); - CUDA_CHECK(cudaMalloc(&buffer->data, size)); - buffer->offset = 0; // cudaMalloc returns aligned pointers - buffer->size = size; + ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT); + buffer->interface.free_data = ggml_backend_cuda_free_buffer; + buffer->backend_data = data; return buffer; - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cuda_free_buffer(ggml_backend_context_t ctx, ggml_backend_buffer_t buffer) { - cuda_backend_buffer * cuda_buffer = (cuda_backend_buffer *)buffer; - CUDA_CHECK(cudaFree(cuda_buffer->data)); - delete cuda_buffer; - - UNUSED(ctx); -} - -static void ggml_backend_cuda_reset_buffer(ggml_backend_context_t ctx, ggml_backend_buffer_t buffer) { - cuda_backend_buffer * cuda_buffer = (cuda_backend_buffer *)buffer; - cuda_buffer->offset = 0; - - UNUSED(ctx); -} - -static void ggml_backend_cuda_alloc_tensor(ggml_backend_context_t ctx, ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - cuda_backend_buffer * cuda_buffer = (cuda_backend_buffer *)buffer; - - if (cuda_buffer->offset + ggml_nbytes(tensor) > cuda_buffer->size) { - fprintf(stderr, "%s: not enough space in the CUDA buffer (needed %zu, available %zu)\n", - __func__, ggml_nbytes(tensor), cuda_buffer->size - cuda_buffer->offset); - GGML_ASSERT(false); - } - - tensor->data = (char*)cuda_buffer->data + cuda_buffer->offset; - cuda_buffer->offset = aligned_offset(cuda_buffer->data, cuda_buffer->offset + ggml_nbytes(tensor), TENSOR_ALIGNMENT); - - UNUSED(ctx); -} - -static void ggml_backend_cuda_set_tensor_async(ggml_backend_context_t ctx, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_set_tensor_async(ggml_backend * backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(tensor->data != NULL && "tensor not 
allocated"); - //ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)ctx; + //ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; CUDA_CHECK(cudaMemcpyAsync((char*)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStream_main)); - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cuda_get_tensor_async(ggml_backend_context_t ctx, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_get_tensor_async(ggml_backend * backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - //ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)ctx; + //ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; CUDA_CHECK(cudaMemcpyAsync(data, (const char*)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStream_main)); - UNUSED(ctx); + UNUSED(backend); } -static void ggml_backend_cuda_synchronize(ggml_backend_context_t ctx) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)ctx; +static void ggml_backend_cuda_synchronize(ggml_backend * backend) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_synchronize(cuda_ctx->cuda_ctx); } -static ggml_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_context_t ctx, ggml_cgraph * cgraph) { +static ggml_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend * backend, ggml_cgraph * cgraph) { GGML_ASSERT(false); return nullptr; - UNUSED(ctx); + UNUSED(backend); UNUSED(cgraph); } -static void ggml_backend_cuda_graph_plan_free(ggml_backend_context_t ctx, ggml_graph_plan_t plan) { +static void ggml_backend_cuda_graph_plan_free(ggml_backend * backend, ggml_graph_plan_t plan) { GGML_ASSERT(false); - UNUSED(ctx); + UNUSED(backend); UNUSED(plan); } -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_context_t ctx, ggml_graph_plan_t plan) { +static void ggml_backend_cuda_graph_plan_compute(ggml_backend * backend, ggml_graph_plan_t plan) { GGML_ASSERT(false); - UNUSED(ctx); + UNUSED(backend); UNUSED(plan); } -static void ggml_backend_cuda_graph_compute(ggml_backend_context_t ctx, ggml_cgraph * cgraph) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)ctx; +static void ggml_backend_cuda_graph_compute(ggml_backend * backend, ggml_cgraph * cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_cgraph_compute(cuda_ctx->cuda_ctx, cgraph); } static ggml_backend_interface cuda_backend_interface = { /* .get_name = */ ggml_backend_cuda_name, - /* .free_context = */ ggml_backend_cuda_free_context, + /* .free = */ ggml_backend_cuda_free, /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer, - /* .free_buffer = */ ggml_backend_cuda_free_buffer, - /* .reset_buffer = */ ggml_backend_cuda_reset_buffer, - /* .alloc_tensor = */ ggml_backend_cuda_alloc_tensor, /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, @@ -1830,11 +1805,12 @@ static ggml_backend_interface cuda_backend_interface = { /* .graph_compute = */ ggml_backend_cuda_graph_compute }; -ggml_backend ggml_backend_cuda_init(void) { +ggml_backend * ggml_backend_cuda_init(void) { ggml_backend_cuda_context * ctx = 
new ggml_backend_cuda_context; - ggml_backend cuda_backend = { - /* .interface = */ &cuda_backend_interface, + ggml_backend * cuda_backend = new ggml_backend; + *cuda_backend = (ggml_backend){ + /* .interface = */ cuda_backend_interface, /* .context = */ ctx }; return cuda_backend; diff --git a/ggml-cuda.h b/ggml-cuda.h index 5e232aaaa..c4fdf0bf2 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -6,12 +6,14 @@ extern "C" { #endif -void * ggml_cuda_host_malloc(size_t size); -void ggml_cuda_host_free(void * ptr); +GGML_API void * ggml_cuda_host_malloc(size_t size); +GGML_API void ggml_cuda_host_free(void * ptr); +GGML_API void ggml_cuda_host_register(void * ptr, size_t size); +GGML_API void ggml_cuda_host_unregister(void * ptr); // backend API -struct ggml_backend ggml_backend_cuda_init(); +GGML_API struct ggml_backend * ggml_backend_cuda_init(); #ifdef __cplusplus diff --git a/ggml.c b/ggml.c index d1fa4c8b8..19db8241f 100644 --- a/ggml.c +++ b/ggml.c @@ -4393,7 +4393,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.compute_type =*/ params.compute_type, }; - ggml_backend_reset_buffer(params.buffer); + ggml_backend_buffer_reset(params.buffer->backend_buffer); GGML_ASSERT(ctx->mem_buffer != NULL); @@ -4526,17 +4526,17 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, + /*.op_params =*/ { 0 }, /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, + /*.node_id =*/ -1, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, - /*.params =*/ { 0 }, /*.data =*/ data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.visited =*/ false, /*.pad =*/ { 0 }, }; @@ -4551,7 +4551,7 @@ struct ggml_tensor * ggml_new_tensor_impl( } if (data == NULL && !ctx->no_alloc) { - ggml_backend_alloc_tensor(ctx->buffer, result); + ggml_backend_buffer_tensor_alloc(ctx->buffer->backend_buffer, result); } // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -4730,7 +4730,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { } */ for (int i = 0; i < ggml_nelements(tensor); i++) { - ggml_backend_set_tensor(tensor, &value, sizeof(float)*i, sizeof(float)); + ggml_backend_tensor_set(tensor, &value, sizeof(float)*i, sizeof(float)); } } break; default: @@ -4839,7 +4839,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); //return ((float *)(tensor->data))[i]; float value; - ggml_backend_get_tensor(tensor, &value, sizeof(float)*i, sizeof(float)); + ggml_backend_tensor_get(tensor, &value, sizeof(float)*i, sizeof(float)); return value; } break; default: @@ -4912,6 +4912,11 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * return tensor; } +static void ggml_set_op_params(struct ggml_tensor * tensor, void * params, size_t params_size) { + GGML_ASSERT(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { @@ -6385,8 +6390,7 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - assert(GGML_MAX_OP_PARAMS >= sizeof(offset)); - memcpy(result->params, &offset, sizeof(offset)); + ggml_set_op_params(result, &offset, sizeof(offset)); result->op = GGML_OP_VIEW; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -6417,8 +6421,7 @@ struct ggml_tensor * ggml_view_2d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - assert(GGML_MAX_OP_PARAMS >= sizeof(offset)); - memcpy(result->params, &offset, sizeof(offset)); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; @@ -6455,8 +6458,7 @@ struct ggml_tensor * ggml_view_3d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - assert(GGML_MAX_OP_PARAMS >= sizeof(offset)); - memcpy(result->params, &offset, sizeof(offset)); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6495,8 +6497,7 @@ struct ggml_tensor * ggml_view_4d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - assert(GGML_MAX_OP_PARAMS >= sizeof(offset)); - memcpy(result->params, &offset, sizeof(offset)); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6569,8 +6570,7 @@ struct ggml_tensor * ggml_permute( result->src[1] = NULL; int32_t params[] = { axis0, axis1, axis2, axis3 }; - assert(GGML_MAX_OP_PARAMS >= sizeof(params)); - memcpy(result->params, params, sizeof(params)); + ggml_set_op_params(result, ¶ms, sizeof(params)); return result; } @@ -6694,8 +6694,7 @@ struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ? 1 : 0 }; - assert(GGML_MAX_OP_PARAMS >= sizeof(params)); - memcpy(result->params, params, sizeof(params)); + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6854,12 +6853,10 @@ struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - // TODO: just use a struct int32_t params[6] = { n_past, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); - assert(GGML_MAX_OP_PARAMS >= sizeof(params)); - memcpy(result->params, ¶ms, sizeof(params)); + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_ROPE; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -11392,8 +11389,8 @@ static void ggml_compute_forward_diag_mask_f32( const int ith = params->ith; const int nth = params->nth; - const int n_past = ((int32_t *) dst->params)[0]; - const bool inplace = (bool)((int32_t *) dst->params)[1]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = (bool)((int32_t *) dst->op_params)[1]; GGML_ASSERT(n_past >= 0); @@ -11910,12 +11907,12 @@ static void ggml_compute_forward_rope_f32( float freq_base; float freq_scale; - const int n_past = ((int32_t *) dst->params)[0]; - const int n_dims = ((int32_t *) dst->params)[1]; - const int mode = ((int32_t *) dst->params)[2]; - const int n_ctx = ((int32_t *) dst->params)[3]; - memcpy(&freq_base, (int32_t *) dst->params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->params + 5, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); assert(n_past >= 0); @@ -12039,12 +12036,12 @@ static void ggml_compute_forward_rope_f16( float freq_base; float freq_scale; - const int n_past = ((int32_t *) dst->params)[0]; - const int n_dims = ((int32_t *) dst->params)[1]; - const int mode = ((int32_t *) dst->params)[2]; - const int n_ctx = ((int32_t *) dst->params)[3]; - memcpy(&freq_base, (int32_t *) dst->params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->params + 5, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); assert(n_past >= 0); @@ -15810,10 +15807,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } // check if already visited - if (node->visited) { + if (node->node_id != -1) { return; } - node->visited = true; for (int i = 0; i < GGML_MAX_SRC; ++i) { if (node->src[i]) { @@ -15821,6 +15817,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } + //node->id = cgraph->n_nodes + cgraph->n_leafs; + // TODO: add ggml_dependency instead of checking for NULL if (node->op == GGML_OP_NONE && node->src[0] == NULL && node->src[1] == NULL && node->grad == NULL) { // reached a leaf node, not part of the gradient graph (e.g. 
a constant) @@ -15830,6 +15828,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * ggml_format_name(node, "leaf_%d", cgraph->n_leafs); } + node->node_id = cgraph->n_leafs; cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; } else { @@ -15839,6 +15838,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * ggml_format_name(node, "node_%d", cgraph->n_nodes); } + node->node_id = cgraph->n_nodes; cgraph->nodes[cgraph->n_nodes] = node; cgraph->grads[cgraph->n_nodes] = node->grad; cgraph->n_nodes++; @@ -15872,10 +15872,10 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * // TODO: this can be removed when ggml_build_forward_expand is removed void ggml_graph_close(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; ++i) { - cgraph->nodes[i]->visited = false; + cgraph->nodes[i]->node_id = -1; } for (int i = 0; i < cgraph->n_leafs; ++i) { - cgraph->leafs[i]->visited = false; + cgraph->leafs[i]->node_id = -1; } cgraph->closed = true; } diff --git a/ggml.h b/ggml.h index 83b384c2c..49760666c 100644 --- a/ggml.h +++ b/ggml.h @@ -414,19 +414,21 @@ extern "C" { // compute data enum ggml_op op; + // op params - allocated as int32_t for alignment + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)]; + bool is_param; struct ggml_tensor * grad; struct ggml_tensor * src[GGML_MAX_SRC]; + int node_id; // used to build graphs + // performance int perf_runs; int64_t perf_cycles; int64_t perf_time_us; - // op params - // allocated as int32_t to avoid alignment issues - int32_t params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)]; void * data; @@ -434,7 +436,6 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - bool visited; // used to build graphs char padding[4]; }; diff --git a/llama.cpp b/llama.cpp index 61e31f45f..5cbffe7d3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -172,7 +172,7 @@ struct llama_kv_cache { struct ggml_context * ctx = NULL; - ggml_buffer buf; + ggml_buffer * buf; int n; // number of tokens currently in the cache @@ -225,29 +225,29 @@ struct llama_model { llama_vocab vocab; // backends - ggml_backend backend_cpu; - ggml_buffer buf_cpu; + ggml_backend * backend_cpu = NULL; + ggml_buffer * buf_cpu = NULL; ggml_context * ctx_cpu = NULL; #ifdef GGML_USE_CUDA - ggml_backend backend_cuda; - ggml_buffer buf_cuda; + ggml_backend * backend_cuda = NULL; + ggml_buffer * buf_cuda = NULL; ggml_context * ctx_cuda = NULL; #endif // backend assigned to each layer - ggml_backend * backend_input = NULL; - ggml_backend * backend_output = NULL; + ggml_backend * backend_inp = NULL; + ggml_backend * backend_out = NULL; std::vector backend_layers; ~llama_model() { if (ctx_cpu) { ggml_free(ctx_cpu); - ggml_backend_free_buffer(&buf_cpu); + ggml_buffer_free(buf_cpu); } #ifdef GGML_USE_CUDA if (ctx_cuda) { ggml_free(ctx_cuda); - ggml_backend_free_buffer(&buf_cuda); + ggml_buffer_free(buf_cuda); } #endif } @@ -286,9 +286,9 @@ struct llama_context { std::vector embedding; // memory buffers used to evaluate the model - ggml_buffer buf_compute_cpu = {}; + ggml_buffer * buf_compute_cpu; #ifdef GGML_USE_CUDA - ggml_buffer buf_compute_cuda = {}; + ggml_buffer * buf_compute_cuda; #endif // input tensors @@ -300,8 +300,19 @@ struct llama_context { struct ggml_tensor * graph_embeddings_out = nullptr; // buffers to store the inputs and outputs of the graphs - ggml_buffer buf_input = {}; - ggml_buffer buf_output = {}; + ggml_buffer * buf_input; + ggml_buffer * buf_output; + + /* + ~llama_context() { 
@@ -300,8 +300,19 @@ struct llama_context {
     struct ggml_tensor * graph_embeddings_out = nullptr;
     // buffers to store the inputs and outputs of the graphs
-    ggml_buffer buf_input = {};
-    ggml_buffer buf_output = {};
+    ggml_buffer * buf_input;
+    ggml_buffer * buf_output;
+
+    /*
+    ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+        if (buf_compute_cpu) {
+            ggml_buffer_free(buf_compute_cpu);
+        }
+    }
+    */
 };
 template <typename T>
@@ -601,9 +612,6 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t lock_size = 0;
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            data_size += lt.size;
-        }
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loader->file, false, ggml_is_numa()));
@@ -613,14 +621,28 @@ struct llama_model_loader {
         }
         size_t done_size = 0;
-        std::vector<uint8_t> tmp_buf;
+        std::vector<uint8_t> load_buf;
+        size_t load_buf_size = 0;
+        for (llama_load_tensor & lt : tensors_map.tensors) {
+            bool is_cpu = lt.ggml_tensor->backend == model->backend_cpu;
+            if (!use_mmap && !is_cpu) {
+                load_buf_size = std::max(load_buf_size, lt.size);
+            }
+            data_size += lt.size;
+        }
+        if (load_buf_size > 0) {
+            load_buf.resize(load_buf_size);
+            // may improve CUDA loading speed without mmap
+            //ggml_cuda_host_register(load_buf.data(), load_buf.size());
+        }
+
         for (llama_load_tensor & lt : tensors_map.tensors) {
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
-            bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu;
+            bool is_cpu = lt.ggml_tensor->backend == model->backend_cpu;
             // select buffer to load data into
             if (!use_mmap) {
@@ -628,8 +650,7 @@ struct llama_model_loader {
                     lt.data = (uint8_t *) lt.ggml_tensor->data;
                 } else {
                     // read to temporary buffer
-                    tmp_buf.resize(lt.size);
-                    lt.data = (uint8_t *) tmp_buf.data();
+                    lt.data = (uint8_t *) load_buf.data();
                 }
             }
@@ -645,7 +666,7 @@ struct llama_model_loader {
                     }
                 }
             } else {
-                ggml_backend_set_tensor(lt.ggml_tensor, lt.data, 0, lt.size);
+                ggml_backend_tensor_set(lt.ggml_tensor, lt.data, 0, lt.size);
                 if (use_mmap) {
                     // hint the OS that we don't need the data anymore
                     // TODO: this may be a bad idea with devices that use the system memory (Metal?)
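The loader change above sizes a single staging buffer once, to the largest tensor that is neither mmapped nor destined for the CPU backend, and reuses it for every upload instead of resizing tmp_buf per tensor. A self-contained sketch of that sizing pass, with hypothetical toy types standing in for llama_load_tensor:

// sketch only - toy types, not the llama.cpp loader
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct toy_load_info {
    size_t size;
    bool   is_cpu;   // CPU tensors are read straight into their destination
};

static std::vector<uint8_t> make_staging_buffer(const std::vector<toy_load_info> & tensors, bool use_mmap) {
    size_t staging_size = 0;
    for (const toy_load_info & t : tensors) {
        if (!use_mmap && !t.is_cpu) {
            staging_size = std::max(staging_size, t.size);
        }
    }
    // empty when everything is mmapped or already lives on the CPU backend
    return std::vector<uint8_t>(staging_size);
}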
@@ -655,6 +676,9 @@ struct llama_model_loader {
             done_size += lt.size;
         }
+        //if (load_buf_size > 0) {
+        //    ggml_cuda_host_unregister(load_buf.data());
+        //}
     }
     void load_data_for(llama_load_tensor & lt) {
@@ -701,11 +725,11 @@ static bool kv_cache_init(
     size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB;
-    cache.buf = ggml_backend_alloc_buffer(backend, size, 2);
+    cache.buf = ggml_buffer_alloc(backend, size, 2);
     cache.n = 0;
     struct ggml_init_params params = ggml_init_params_default();
-    params.buffer = &cache.buf;
+    params.buffer = cache.buf;
     cache.ctx = ggml_init(params);
@@ -771,7 +795,7 @@ void llama_backend_init(bool numa) {
     // needed to initialize f16 tables
     {
         struct ggml_init_params params = ggml_init_params_default();
-        params.buffer = {0};
+        params.buffer = NULL;
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
@@ -940,30 +964,30 @@ static void llama_model_load_internal(
     const uint32_t n_layer = hparams.n_layer;
     model.backend_cpu = ggml_backend_cpu_init();
-    ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection
+    ggml_backend * backend_gpu = model.backend_cpu; // hack until we have a proper backend selection
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         model.backend_cuda = ggml_backend_cuda_init();
-        backend_gpu = &model.backend_cuda;
+        backend_gpu = model.backend_cuda;
     }
 #endif
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
-    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : &model.backend_cpu;
-    model.backend_output = n_gpu_layers > 0 ? backend_gpu : &model.backend_cpu;
+    model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : model.backend_cpu;
+    model.backend_out = n_gpu_layers > 0 ? backend_gpu : model.backend_cpu;
     model.backend_layers.resize(n_layer);
-    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, &model.backend_cpu);
+    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, model.backend_cpu);
     std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(), backend_gpu);
     // calculate the size of each context
     std::unordered_map<struct ggml_backend *, size_t> ctx_sizes;
     for (const llama_load_tensor & lt : ml->tensors_map.tensors) {
         if (lt.name == "tok_embeddings.weight") {
-            ctx_sizes[model.backend_input] += lt.size;
+            ctx_sizes[model.backend_inp] += lt.size;
         } else if (lt.name == "norm.weight" || lt.name == "output.weight") {
-            ctx_sizes[model.backend_output] += lt.size;
+            ctx_sizes[model.backend_out] += lt.size;
         } else {
             // parse layer number from name
@@ -980,14 +1004,14 @@ static void llama_model_load_internal(
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[&model.backend_cpu];
-        ctx_sizes[&model.backend_cpu] = 0;
+        mmap_size = ctx_sizes[model.backend_cpu];
+        ctx_sizes[model.backend_cpu] = 0;
     }
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == &model.backend_cpu && ml->use_mmap) {
+        if (it.first == model.backend_cpu && ml->use_mmap) {
             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
         }
         fprintf(stderr, "\n");
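The split above keeps the first i_gpu_start layers on the CPU backend and hands the remaining n_gpu_layers to the GPU backend, with the input and output tensors following the same rule. A standalone sketch of the same arithmetic, using plain strings instead of ggml_backend pointers:

// sketch only - strings stand in for backend handles
#include <algorithm>
#include <string>
#include <vector>

static std::vector<std::string> assign_layer_backends(int n_layer, int n_gpu_layers) {
    const int i_gpu_start = std::max(0, n_layer - n_gpu_layers);
    std::vector<std::string> layers(n_layer);
    std::fill(layers.begin(), layers.begin() + i_gpu_start, "CPU");
    std::fill(layers.begin() + i_gpu_start, layers.end(),   "GPU");
    // e.g. n_layer=32, n_gpu_layers=4 -> layers 28..31 end up on the GPU
    return layers;
}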
@@ -996,10 +1020,10 @@ static void llama_model_load_internal(
     // create the buffers and contexts
     {
         size_t cpu_num_tensors = ml->tensors_map.tensors.size();
-        size_t ctx_size = ctx_sizes[&model.backend_cpu];
-        model.buf_cpu = ggml_backend_alloc_buffer(&model.backend_cpu, ctx_size, cpu_num_tensors);
+        size_t ctx_size = ctx_sizes[model.backend_cpu];
+        model.buf_cpu = ggml_buffer_alloc(model.backend_cpu, ctx_size, cpu_num_tensors);
         struct ggml_init_params params = ggml_init_params_default();
-        params.buffer = &model.buf_cpu;
+        params.buffer = model.buf_cpu;
         params.no_alloc = ml->use_mmap;
         model.ctx_cpu = ggml_init(params);
         if (!model.ctx_cpu) {
@@ -1011,10 +1035,10 @@ static void llama_model_load_internal(
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         size_t gpu_num_tensors = ml->tensors_map.tensors.size();
-        size_t ctx_size = ctx_sizes[&model.backend_cuda];
-        model.buf_cuda = ggml_backend_alloc_buffer(&model.backend_cuda, ctx_size, gpu_num_tensors);
+        size_t ctx_size = ctx_sizes[model.backend_cuda];
+        model.buf_cuda = ggml_buffer_alloc(model.backend_cuda, ctx_size, gpu_num_tensors);
         struct ggml_init_params params = ggml_init_params_default();
-        params.buffer = &model.buf_cuda;
+        params.buffer = model.buf_cuda;
         model.ctx_cuda = ggml_init(params);
         if (!model.ctx_cuda) {
             throw std::runtime_error(format("ggml_init() failed for CUDA backend"));
@@ -1025,9 +1049,9 @@ static void llama_model_load_internal(
     // TODO: clean this
     ggml_context * ctx_input = model.ctx_cpu;
-    if (model.backend_input == backend_gpu) ctx_input = ctx_gpu;
+    if (model.backend_inp == backend_gpu) ctx_input = ctx_gpu;
     ggml_context * ctx_output = model.ctx_cpu;
-    if (model.backend_output == backend_gpu) ctx_output = ctx_gpu;
+    if (model.backend_out == backend_gpu) ctx_output = ctx_gpu;
     std::vector<ggml_context *> ctx_layers(n_layer, model.ctx_cpu);
     for (uint32_t i = 0; i < n_layer; ++i) {
         if (model.backend_layers[i] == backend_gpu) {
@@ -1181,18 +1205,18 @@ static ggml_graph_splits llama_build_graph(
     // initialize contexts for every backend
     struct ggml_context * ctx_cpu = nullptr;
-    if (lctx.buf_compute_cpu.mem_size > 0) {
+    if (lctx.buf_compute_cpu != nullptr) {
         struct ggml_init_params params = ggml_init_params_default();
-        params.buffer = &lctx.buf_compute_cpu;
+        params.buffer = lctx.buf_compute_cpu;
         params.compute_type = compute_type;
         ctx_cpu = ggml_init(params);
     }
 #ifdef GGML_USE_CUDA
     struct ggml_context * ctx_cuda = nullptr;
-    if (lctx.buf_compute_cuda.mem_size > 0) {
+    if (lctx.buf_compute_cuda != nullptr) {
         struct ggml_init_params params = ggml_init_params_default();
-        params.buffer = &lctx.buf_compute_cuda;
+        params.buffer = lctx.buf_compute_cuda;
         params.compute_type = compute_type;
         ctx_cuda = ggml_init(params);
     }
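In the hunks around here every backend gets its own buffer and ggml_context, and the graph builder then has to pick the context that matches the backend a layer was assigned to. A toy sketch of that backend-to-context lookup (hypothetical types, not the llama.cpp code):

// sketch only - toy types, not ggml_backend / ggml_context
#include <unordered_map>
#include <vector>

struct toy_backend {};
struct toy_context {};

static std::vector<toy_context *> contexts_for_layers(
        const std::vector<toy_backend *> & layer_backends,
        const std::unordered_map<toy_backend *, toy_context *> & ctx_for_backend) {
    std::vector<toy_context *> ctx_ls(layer_backends.size(), nullptr);
    for (size_t il = 0; il < layer_backends.size(); ++il) {
        // every layer's tensors are created in the context of that layer's backend
        ctx_ls[il] = ctx_for_backend.at(layer_backends[il]);
    }
    return ctx_ls;
}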
@@ -1204,26 +1228,30 @@ static ggml_graph_splits llama_build_graph(
     struct ggml_context * ctx_o = nullptr;
     struct ggml_context * ctx_kv = nullptr;
-    if (lctx.model.backend_input == &lctx.model.backend_cpu) ctx_i = ctx_cpu;
-    if (lctx.model.backend_output == &lctx.model.backend_cpu) ctx_o = ctx_cpu;
+    if (lctx.model.backend_inp == lctx.model.backend_cpu) ctx_i = ctx_cpu;
+    if (lctx.model.backend_out == lctx.model.backend_cpu) ctx_o = ctx_cpu;
 #ifdef GGML_USE_CUDA
-    if (lctx.model.backend_input == &lctx.model.backend_cuda) ctx_i = ctx_cuda;
-    if (lctx.model.backend_output == &lctx.model.backend_cuda) ctx_o = ctx_cuda;
+    if (lctx.model.backend_inp == lctx.model.backend_cuda) ctx_i = ctx_cuda;
+    if (lctx.model.backend_out == lctx.model.backend_cuda) ctx_o = ctx_cuda;
 #endif
     for (int il = 0; il < n_layer; il++) {
-        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;
+        if (lctx.model.backend_layers[il] == lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;
 #ifdef GGML_USE_CUDA
-        if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda;
+        if (lctx.model.backend_layers[il] == lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda;
 #endif
     }
-    if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu;
+    if (lctx.backend_kv == lctx.model.backend_cpu) ctx_kv = ctx_cpu;
 #ifdef GGML_USE_CUDA
-    if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda;
+    if (lctx.backend_kv == lctx.model.backend_cuda) ctx_kv = ctx_cuda;
 #endif
     struct ggml_tensor * inpL;
+    // reuse the scale tensor for all layers since it requires a memory transfer
+    struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
     if (embeddings_input) {
         // use embeddings as input
         struct ggml_tensor * embd_in = lctx.graph_embeddings_in;
@@ -1236,10 +1264,6 @@ static ggml_graph_splits llama_build_graph(
         inpL = ggml_get_rows(ctx_i, model.tok_embeddings, token_in);
     }
-    // reuse the scale tensor for all layers since it requires a memory transfer
-    struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
-
     struct ggml_tensor * cur = nullptr;
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_context * ctx_l = ctx_ls[il];
@@ -1540,16 +1564,16 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
-    ggml_backend_cpu_set_n_threads(const_cast<ggml_backend *>(&model.backend_cpu), n_threads);
+    ggml_backend_cpu_set_n_threads(const_cast<ggml_backend *>(model.backend_cpu), n_threads);
     struct ggml_graph_splits splits = llama_build_graph(lctx, N, n_past, embd_input);
     if (tokens != nullptr) {
         // copy the tokens to the input tensor
-        ggml_backend_set_tensor_async(lctx.graph_tokens_in, tokens, 0, N*ggml_element_size(lctx.graph_tokens_in));
+        ggml_backend_tensor_set_async(lctx.graph_tokens_in, tokens, 0, N*ggml_element_size(lctx.graph_tokens_in));
     } else {
         // copy the embeddings to the input tensor
-        ggml_backend_set_tensor_async(lctx.graph_embeddings_in, embd, 0, N*n_embd*ggml_element_size(lctx.graph_embeddings_in));
+        ggml_backend_tensor_set_async(lctx.graph_embeddings_in, embd, 0, N*n_embd*ggml_element_size(lctx.graph_embeddings_in));
     }
     // run the computation
@@ -1577,11 +1601,11 @@ static bool llama_eval_internal(
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            ggml_backend_get_tensor_async(lctx.graph_logits, logits_out.data(), 0, N*n_vocab*sizeof(float));
+            ggml_backend_tensor_get_async(lctx.graph_logits, logits_out.data(), 0, N*n_vocab*sizeof(float));
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            ggml_backend_get_tensor_async(lctx.graph_logits, logits_out.data(), 0, n_vocab*sizeof(float));
+            ggml_backend_tensor_get_async(lctx.graph_logits, logits_out.data(), 0, n_vocab*sizeof(float));
         }
     }
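The renamed transfer functions above are asynchronous: inputs are queued with ggml_backend_tensor_set_async before the graph runs, outputs are queued with ggml_backend_tensor_get_async after it, and ggml_backend_synchronize is where the host may finally touch the data. A hedged usage sketch assuming the signatures shown in this patch; the graph and tensor setup is omitted and the helper name eval_io_sketch is made up:

// sketch only - assumes ggml.h and the ggml-backend.h API from this patch
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

static void eval_io_sketch(struct ggml_backend * backend,
                           struct ggml_tensor * tokens_in,
                           struct ggml_tensor * logits_out,
                           const int * tokens, int n_tokens,
                           std::vector<float> & logits) {
    // queue the upload of the input tokens; the copy may overlap with graph setup
    ggml_backend_tensor_set_async(tokens_in, tokens, 0, n_tokens*ggml_element_size(tokens_in));

    // ... build and compute the graph splits here ...

    // queue the download of the logits, then wait before reading them on the host
    logits.resize(ggml_nelements(logits_out));
    ggml_backend_tensor_get_async(logits_out, logits.data(), 0, logits.size()*sizeof(float));
    ggml_backend_synchronize(backend);
}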
@@ -1589,13 +1613,13 @@ static bool llama_eval_internal(
     if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
         embedding_out.resize(n_embd);
-        ggml_backend_get_tensor_async(lctx.graph_embeddings_out, embedding_out.data(), 0, n_embd*sizeof(float));
+        ggml_backend_tensor_get_async(lctx.graph_embeddings_out, embedding_out.data(), 0, n_embd*sizeof(float));
     }
 #ifdef GGML_USE_CUDA
     // wait for the async copy to finish
     if (lctx.model.n_gpu_layers > 0) {
-        ggml_backend_synchronize(const_cast<ggml_backend *>(&lctx.model.backend_cuda));
+        ggml_backend_synchronize(const_cast<ggml_backend *>(lctx.model.backend_cuda));
     }
 #endif
@@ -2063,7 +2087,7 @@ void llama_sample_classifier_free_guidance(
           struct llama_context * guidance_ctx,
                          float   scale,
                          float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
@@ -2608,13 +2632,13 @@ struct llama_context * llama_new_context_with_model(
     // TODO: choose backend depending on n_layers/low_vram
 #ifdef GGML_USE_CUDA
-    if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2) {
-        ctx->backend_kv = &model->backend_cuda;
+    if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2 && !params.low_vram) {
+        ctx->backend_kv = model->backend_cuda;
     } else {
-        ctx->backend_kv = &model->backend_cpu;
+        ctx->backend_kv = model->backend_cpu;
     }
 #else
-    ctx->backend_kv = &model->backend_cpu;
+    ctx->backend_kv = model->backend_cpu;
 #endif
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
@@ -2639,10 +2663,12 @@ struct llama_context * llama_new_context_with_model(
     }
     // TODO: size the buffers more accurately - depends on improved memory management
-    ctx->buf_compute_cpu = ggml_backend_alloc_buffer(&model->backend_cpu, MEM_REQ_EVAL().at(ctx->model.type), 2048);
+    ctx->buf_compute_cpu = ggml_buffer_alloc(model->backend_cpu, MEM_REQ_EVAL().at(ctx->model.type), 2048);
+    // TODO: pinned memory for faster host-device transfers
+    //ggml_cuda_host_register(*(void**)ctx->buf_compute_cpu.backend_buffer, MEM_REQ_EVAL().at(ctx->model.type) + 128*2048);
 #ifdef GGML_USE_CUDA
     if (params.n_gpu_layers > 0) {
-        ctx->buf_compute_cuda = ggml_backend_alloc_buffer(&model->backend_cuda, MEM_REQ_EVAL().at(ctx->model.type), 2048);
+        ctx->buf_compute_cuda = ggml_buffer_alloc(model->backend_cuda, MEM_REQ_EVAL().at(ctx->model.type), 2048);
     }
 #endif
@@ -2653,10 +2679,10 @@ struct llama_context * llama_new_context_with_model(
         buf_input_size += hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input tokens
         // TODO: input embeddings should be optional to save memory
         buf_input_size += hparams.n_embd * hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input embeddings
-        ctx->buf_input = ggml_backend_alloc_buffer(model->backend_input, buf_input_size, 2);
+        ctx->buf_input = ggml_buffer_alloc(model->backend_inp, buf_input_size, 2);
         struct ggml_init_params ggml_params = ggml_init_params_default();
-        ggml_params.buffer = &ctx->buf_input;
+        ggml_params.buffer = ctx->buf_input;
         ggml_context * ctx0 = ggml_init(ggml_params);
         ctx->graph_tokens_in = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, hparams.n_ctx);
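The condition above keeps the KV cache on the GPU only when at least half of the layers are offloaded and --low-vram was not requested. A standalone restatement of that placement rule with toy types (not the llama.cpp structures):

// sketch only - toy enum instead of ggml_backend pointers
enum class toy_backend_kind { cpu, gpu };

static toy_backend_kind pick_kv_backend(unsigned n_gpu_layers, unsigned n_layer, bool low_vram, bool have_gpu) {
    if (have_gpu && n_gpu_layers >= n_layer/2 && !low_vram) {
        return toy_backend_kind::gpu;   // cache next to the offloaded layers
    }
    return toy_backend_kind::cpu;       // default, and the only option without CUDA
}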
@@ -2677,10 +2703,10 @@ struct llama_context * llama_new_context_with_model(
         if (params.embedding) {
             buf_output_size += hparams.n_embd * ggml_type_size(GGML_TYPE_F32);
         }
-        ctx->buf_output = ggml_backend_alloc_buffer(model->backend_output, buf_output_size, 2);
+        ctx->buf_output = ggml_buffer_alloc(model->backend_out, buf_output_size, 2);
         struct ggml_init_params ggml_params = ggml_init_params_default();
-        ggml_params.buffer = &ctx->buf_output;
+        ggml_params.buffer = ctx->buf_output;
         ggml_context * ctx0 = ggml_init(ggml_params);
         ctx->graph_logits = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_vocab, params.logits_all ? hparams.n_ctx : 1);
@@ -2706,7 +2732,7 @@ struct llama_context * llama_new_context_with_model(
     }
     fprintf(stderr, "%s: layer backends: ", __func__);
-    fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_input));
+    fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp));
     int start = 0;
     struct ggml_backend * prev_backend = ctx->model.backend_layers[0];
@@ -2721,7 +2747,7 @@ struct llama_context * llama_new_context_with_model(
             prev_backend = ctx->model.backend_layers[i];
         }
     }
-    fprintf(stderr, "output: %s, ", ggml_backend_name(ctx->model.backend_output));
+    fprintf(stderr, "output: %s, ", ggml_backend_name(ctx->model.backend_out));
     fprintf(stderr, "kv: %s\n", ggml_backend_name(ctx->backend_kv));
 #ifdef GGML_USE_MPI
@@ -2753,6 +2779,7 @@ struct llama_context * llama_init_from_file(
 }
 void llama_free(struct llama_context * ctx) {
+    // TODO: free buffers - move this to destructor like llama_model
     if (ctx->model_owner) {
         delete &ctx->model;
     }
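The TODO above, together with the commented-out ~llama_context earlier in this patch, suggests llama_free should also release the ggml_buffers the context allocated. One possible shape for that cleanup, written against the fields and the ggml_buffer_free call introduced by this patch; this is an assumption about where the WIP is heading, not code from the patch:

// sketch only - assumes the llama_context fields shown earlier in this patch
static void llama_free_sketch(struct llama_context * ctx) {
    if (ctx->model_owner) {
        delete &ctx->model;
    }
    if (ctx->buf_compute_cpu) {
        ggml_buffer_free(ctx->buf_compute_cpu);
    }
#ifdef GGML_USE_CUDA
    if (ctx->buf_compute_cuda) {
        ggml_buffer_free(ctx->buf_compute_cuda);
    }
#endif
    delete ctx;
}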