From ee8fb399aa0cecaf6c0e6ce56d89184fd166191f Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Sat, 9 Dec 2023 12:42:25 +0100
Subject: [PATCH] ggml : add n_as argument to ggml_mul_mat_id

---
 ggml-cuda.cu               |  4 +++-
 ggml-metal.m               |  2 +-
 ggml.c                     | 14 ++++++--------
 ggml.h                     |  1 +
 llama.cpp                  |  6 +++---
 tests/test-backend-ops.cpp |  4 +++-
 6 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c1c7c30e5..04a5d2078 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -8244,6 +8244,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = dst->op_params[0];
+    const int32_t n_as = dst->op_params[1];
+
     const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
 
     std::vector<char> ids_host(ggml_nbytes(ids));
@@ -8272,7 +8274,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
         const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
-        GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
         const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
diff --git a/ggml-metal.m b/ggml-metal.m
index 595bb6c0f..8389373a8 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1460,7 +1460,7 @@ void ggml_metal_graph_compute(
 
                             GGML_ASSERT(src0t == GGML_TYPE_I32);
 
-                            const int n_as = ne00;
+                            const int n_as = ((int32_t *) dst->op_params)[1];
 
                             // TODO: make this more general
                             GGML_ASSERT(n_as <= 8);
diff --git a/ggml.c b/ggml.c
index 1c81e7912..9982c2dad 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4076,12 +4076,11 @@ struct ggml_tensor * ggml_mul_mat(
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
         struct ggml_tensor  * as[],
+        int                   n_as,
         struct ggml_tensor  * ids,
         int                   id,
         struct ggml_tensor  * b) {
 
-    int64_t n_as = ids->ne[0];
-
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
     GGML_ASSERT(ids->ne[1] == b->ne[1]);
@@ -4099,6 +4098,7 @@ struct ggml_tensor * ggml_mul_mat_id(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
 
     ggml_set_op_params_i32(result, 0, id);
+    ggml_set_op_params_i32(result, 1, n_as);
 
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4106,8 +4106,7 @@ struct ggml_tensor * ggml_mul_mat_id(
     result->src[1] = b;
 
     // TODO: n_as is the selected experts, but it should be the total number of experts
-    //for (int64_t i = 0; i < n_as; i++) {
-    for (int64_t i = 0; i < 8; i++) {
+    for (int i = 0; i < n_as; i++) {
         struct ggml_tensor * a = as[i];
         GGML_ASSERT(ggml_are_same_shape(as[0], a));
         GGML_ASSERT(ggml_can_mul_mat(a, b));
@@ -9757,14 +9756,13 @@ static void ggml_compute_forward_mul_mat_id(
     }
 
     const struct ggml_tensor * ids = src0;
-    const int id = ggml_get_op_params_i32(dst, 0);
+    const int id   = ggml_get_op_params_i32(dst, 0);
+    const int n_as = ggml_get_op_params_i32(dst, 1);
 
     for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
         const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
 
-        // TODO: this assert seems wrong?
-        //printf("row_id = %d, ids->ne[0] = %d, id = %d\n", row_id, ids->ne[0], id);
-        //GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
         const struct ggml_tensor * src0_row = dst->src[row_id + 2];
         ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
diff --git a/ggml.h b/ggml.h
index b154b6dae..bb09160b9 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1052,6 +1052,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
             struct ggml_tensor  * as[],
+            int                   n_as,
             struct ggml_tensor  * ids,
             int                   id,
             struct ggml_tensor  * b);
diff --git a/llama.cpp b/llama.cpp
index 3c4da6a1c..3b2a67979 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4270,11 +4270,11 @@ struct llm_build_context {
                     ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers[il].ffn_down_exp;
 
                     cur_expert = ggml_mul(ctx0,
-                            ggml_mul_mat_id(ctx0, ffn_up_exp, selected_experts, i, cur),
+                            ggml_mul_mat_id(ctx0, ffn_up_exp, n_experts, selected_experts, i, cur),
                             ggml_silu(ctx0,
-                                ggml_mul_mat_id(ctx0, ffn_gate_exp, selected_experts, i, cur))); // [n_tokens, n_embd]
+                                ggml_mul_mat_id(ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur))); // [n_tokens, n_embd]
 
-                    cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+                    cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd]
                     cur_expert = ggml_mul(ctx0, cur_expert,
                             ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 5b1b8cb7c..dddc2b899 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -343,6 +343,8 @@ struct test_case {
                 ud->ok = false;
             }
             return true;
+
+            GGML_UNUSED(index);
         };
 
         ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
@@ -803,7 +805,7 @@ struct test_mul_mat_id : public test_case {
         }
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
-        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b);
+        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, id, b);
         return out;
     }