From 9c72e7e916254e624d1449e9fc5de7bfb6b0cf49 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 16 Jul 2023 14:36:32 +0200 Subject: [PATCH] rebase to master (except ggml-cuda) --- ggml-backend.c | 2 +- ggml-cuda.cu | 30 +++++++++++------------------- ggml.c | 11 +++++------ ggml.h | 2 +- llama.cpp | 6 +++--- 5 files changed, 21 insertions(+), 30 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 8ad5fa925..85a6cac05 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -289,7 +289,7 @@ void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_t if ((*inputs[0])->backend == ggml_get_ctx_backend(ctx)) { if (splits->n_splits > 0) { - char name[GGML_MAX_NAME - 1]; // silence -Wformat-truncation + char name[GGML_MAX_NAME]; vsnprintf(name, sizeof(name), fmt, args); char new_name[GGML_MAX_NAME]; snprintf(new_name, sizeof(new_name), "%s,%s", splits->splits[splits->n_splits - 1].name, name); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d31823d81..343eda0b2 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1475,8 +1475,8 @@ static void ggml_cuda_mul_mat(ggml_cuda_context * ctx, ggml_tensor * src0, ggml_ } static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cudaStream_t stream) { - ggml_tensor * src0 = node->src0; - ggml_tensor * src1 = node->src1; + ggml_tensor * src0 = node->src[0]; + ggml_tensor * src1 = node->src[1]; ggml_tensor * dst = node; #if 0 @@ -1551,8 +1551,6 @@ static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cud } } -static const int GGML_MAX_PARENTS = 2 + GGML_MAX_OPT; - static bool ggml_is_noop(ggml_tensor * t) { return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE; @@ -1581,26 +1579,20 @@ static void ggml_cuda_graph_exec_parallel(ggml_cuda_context * ctx, ggml_cgraph * ggml_tensor * node = gf->nodes[i]; const bool is_noop = ggml_is_noop(node); - // build a list of parents - ggml_tensor * 
parents[GGML_MAX_PARENTS] = { node->src0, node->src1 }; - for (int j = 0; j < GGML_MAX_OPT; j++) { - parents[j + 2] = node->opt[j]; - } - // assign an stream for the node cudaStream_t stream = nullptr; // take a stream from a parent - for (int j = 0; j < GGML_MAX_PARENTS; j++) { - if (parents[j] && stream_map.count(parents[j]) && stream_map[parents[j]] != nullptr) { - stream = stream_map[parents[j]]; - stream_map.erase(parents[j]); + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] && stream_map.count(node->src[j]) && stream_map[node->src[j]] != nullptr) { + stream = stream_map[node->src[j]]; + stream_map.erase(node->src[j]); if (is_noop) { // if this is a noop, we can use the parent's event stream_map[node] = stream; - if (event_map.count(parents[j]) > 0) { - event_map[node] = event_map[parents[j]]; + if (event_map.count(node->src[j]) > 0) { + event_map[node] = event_map[node->src[j]]; } } break; @@ -1624,9 +1616,9 @@ static void ggml_cuda_graph_exec_parallel(ggml_cuda_context * ctx, ggml_cgraph * // wait on parent streams bool waited = false; - for (int j = 0; j < GGML_MAX_PARENTS; j++) { - if (parents[j] && event_map.count(parents[j]) > 0) { - CUDA_CHECK(cudaStreamWaitEvent(stream, event_map[parents[j]], 0)); + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] && event_map.count(node->src[j]) > 0) { + CUDA_CHECK(cudaStreamWaitEvent(stream, event_map[node->src[j]], 0)); waited = true; } } diff --git a/ggml.c b/ggml.c index 1ff77d6ec..d1fa4c8b8 100644 --- a/ggml.c +++ b/ggml.c @@ -6855,7 +6855,9 @@ struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); // TODO: just use a struct - int32_t params[] = { n_past, n_dims, mode, n_ctx, *(int32_t*)&freq_base, *(int32_t*)&freq_scale}; + int32_t params[6] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); assert(GGML_MAX_OP_PARAMS >= sizeof(params)); memcpy(result->params, &params, sizeof(params)); @@ -7127,13 +7129,11 @@ struct ggml_tensor* ggml_pool_1d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - ggml_scratch_save(ctx); struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); ((int32_t*)c->data)[0] = op; ((int32_t*)c->data)[1] = k0; ((int32_t*)c->data)[2] = s0; ((int32_t*)c->data)[3] = p0; - ggml_scratch_load(ctx); result->op = GGML_OP_POOL_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7170,7 +7170,6 @@ struct ggml_tensor* ggml_pool_2d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - ggml_scratch_save(ctx); struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7); ((int32_t*)c->data)[0] = op; ((int32_t*)c->data)[1] = k0; @@ -7179,7 +7178,6 @@ struct ggml_tensor* ggml_pool_2d( ((int32_t*)c->data)[4] = s1; ((int32_t*)c->data)[5] = p0; ((int32_t*)c->data)[6] = p1; - ggml_scratch_load(ctx); result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -15823,7 +15821,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } - if (node->op == GGML_OP_NONE && node->src0 == NULL && node->src1 == NULL && node->grad == NULL) { + // TODO: add ggml_dependency instead of checking for NULL + if (node->op == GGML_OP_NONE && node->src[0] == NULL && node->src[1] == NULL && node->grad == NULL) { // reached a leaf node, not part of the gradient graph (e.g.
a constant) GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); diff --git a/ggml.h b/ggml.h index 949f65656..83b384c2c 100644 --- a/ggml.h +++ b/ggml.h @@ -199,7 +199,7 @@ #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 48 -#define GGML_MAX_OP_PARAMS 16 +#define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 diff --git a/llama.cpp b/llama.cpp index 7869275fc..8a40002d8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1168,7 +1168,7 @@ static ggml_graph_splits llama_build_graph( struct ggml_graph_splits splits = ggml_graph_split_init(); - // initalize contexts for every backend + // initialize contexts for every backend struct ggml_context * ctx_cpu = nullptr; // TODO: don't create context if there are no CPU layers @@ -1295,8 +1295,8 @@ static ggml_graph_splits llama_build_graph( // TODO: replace with ggml_dependency / ggml_depends_on k = ggml_view_tensor(ctx_kv, kv_self.k); v = ggml_view_tensor(ctx_kv, kv_self.v); - k->src0 = k_cpy; - v->src0 = v_cpy; + k->src[0] = k_cpy; + v->src[0] = v_cpy; } struct ggml_tensor * Q =