From ee8fb399aa0cecaf6c0e6ce56d89184fd166191f Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 9 Dec 2023 12:42:25 +0100 Subject: [PATCH] ggml : add n_as argument to ggml_mul_mat_id --- ggml-cuda.cu | 4 +++- ggml-metal.m | 2 +- ggml.c | 14 ++++++-------- ggml.h | 1 + llama.cpp | 6 +++--- tests/test-backend-ops.cpp | 4 +++- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c1c7c30e5..04a5d2078 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8244,6 +8244,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const struct ggml_tensor * ids = src0; const int32_t id = dst->op_params[0]; + const int32_t n_as = dst->op_params[1]; + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; std::vector ids_host(ggml_nbytes(ids)); @@ -8272,7 +8274,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); - GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]); + GGML_ASSERT(row_id >= 0 && row_id < n_as); const struct ggml_tensor * src0_row = dst->src[row_id + 2]; diff --git a/ggml-metal.m b/ggml-metal.m index 595bb6c0f..8389373a8 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1460,7 +1460,7 @@ void ggml_metal_graph_compute( GGML_ASSERT(src0t == GGML_TYPE_I32); - const int n_as = ne00; + const int n_as = ((int32_t *) dst->op_params)[1]; // TODO: make this more general GGML_ASSERT(n_as <= 8); diff --git a/ggml.c b/ggml.c index 1c81e7912..9982c2dad 100644 --- a/ggml.c +++ b/ggml.c @@ -4076,12 +4076,11 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, struct ggml_tensor * as[], + int n_as, struct ggml_tensor * ids, int id, struct ggml_tensor * b) { - int64_t n_as = ids->ne[0]; - GGML_ASSERT(ids->type == GGML_TYPE_I32); GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); GGML_ASSERT(ids->ne[1] == b->ne[1]); @@ -4099,6 +4098,7 @@ struct ggml_tensor * ggml_mul_mat_id( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne); ggml_set_op_params_i32(result, 0, id); + ggml_set_op_params_i32(result, 1, n_as); result->op = GGML_OP_MUL_MAT_ID; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4106,8 +4106,7 @@ struct ggml_tensor * ggml_mul_mat_id( result->src[1] = b; // TODO: n_as is the selected experts, but it should be the total number of experts - //for (int64_t i = 0; i < n_as; i++) { - for (int64_t i = 0; i < 8; i++) { + for (int i = 0; i < n_as; i++) { struct ggml_tensor * a = as[i]; GGML_ASSERT(ggml_are_same_shape(as[0], a)); GGML_ASSERT(ggml_can_mul_mat(a, b)); @@ -9757,14 +9756,13 @@ static void ggml_compute_forward_mul_mat_id( } const struct ggml_tensor * ids = src0; - const int id = ggml_get_op_params_i32(dst, 0); + const int id = ggml_get_op_params_i32(dst, 0); + const int n_as = ggml_get_op_params_i32(dst, 1); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); - // TODO: this assert seems wrong? - //printf("row_id = %d, ids->ne[0] = %d, id = %d\n", row_id, ids->ne[0], id); - //GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]); + GGML_ASSERT(row_id >= 0 && row_id < n_as); const struct ggml_tensor * src0_row = dst->src[row_id + 2]; ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1); diff --git a/ggml.h b/ggml.h index b154b6dae..bb09160b9 100644 --- a/ggml.h +++ b/ggml.h @@ -1052,6 +1052,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, struct ggml_tensor * as[], + int n_as, struct ggml_tensor * ids, int id, struct ggml_tensor * b); diff --git a/llama.cpp b/llama.cpp index 3c4da6a1c..3b2a67979 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4270,11 +4270,11 @@ struct llm_build_context { ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers[il].ffn_down_exp; cur_expert = ggml_mul(ctx0, - ggml_mul_mat_id(ctx0, ffn_up_exp, selected_experts, i, cur), + ggml_mul_mat_id(ctx0, ffn_up_exp, n_experts, selected_experts, i, cur), ggml_silu(ctx0, - ggml_mul_mat_id(ctx0, ffn_gate_exp, selected_experts, i, cur))); // [n_tokens, n_embd] + ggml_mul_mat_id(ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur))); // [n_tokens, n_embd] - cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd] cur_expert = ggml_mul(ctx0, cur_expert, ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 5b1b8cb7c..dddc2b899 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -343,6 +343,8 @@ struct test_case { ud->ok = false; } return true; + + GGML_UNUSED(index); }; ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); @@ -803,7 +805,7 @@ struct test_mul_mat_id : public test_case { } ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n); - ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b); + ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, id, b); return out; }