From 16bdfa42acb09175e88cf97e9d9e4e48f616d120 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Mon, 15 Jul 2024 19:32:15 +0800
Subject: [PATCH] [SYCL] add concat through dim 1/2 (#8483)

* add concat through dim 1/2
---
 ggml/src/ggml-sycl.cpp         |  67 +----------
 ggml/src/ggml-sycl/backend.hpp |   1 +
 ggml/src/ggml-sycl/concat.cpp  | 195 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-sycl/concat.hpp  |  21 ++++
 4 files changed, 218 insertions(+), 66 deletions(-)
 create mode 100644 ggml/src/ggml-sycl/concat.cpp
 create mode 100644 ggml/src/ggml-sycl/concat.hpp

diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 5a890237f..36518ff93 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -291,29 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] * x[i];
 }
 
-static void concat_f32(const float  *x,const float  *y, float *dst, const int ne0, const int ne02,
-                       const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (item_ct1.get_group(0) < ne02) { // src0
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-            dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
-            dst[offset_dst] = y[offset_src];
-    }
-}
-
 static void upscale_f32(const float  *x, float *dst, const int nb00, const int nb01,
                         const int nb02, const int nb03, const int ne10, const int ne11,
                         const int ne12, const int ne13, const float sf0, const float sf1,
@@ -1347,20 +1324,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-static void concat_f32_sycl(const float *x, const float *y, float *dst,
-                            const int ne0, int ne1, int ne2, int ne02,
-                            queue_ptr stream) {
-    int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            concat_f32(x, y, dst, ne0, ne02, item_ct1);
-        });
-}
-
 static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
                              const int nb02, const int nb03, const int ne10, const int ne11,
                              const int ne12, const int ne13, const float sf0, const float sf1,
@@ -2429,28 +2392,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const queue_ptr &main_stream) {
-#pragma message("TODO: generalize concat kernel for dim != 2")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7563")
-    int dim = dst->op_params[0];
-    GGML_ASSERT(dim == 2);
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
-    }
-
-    (void) src1;
-    (void) dst;
-}
-
 inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  const float *src0_dd, const float *src1_dd,
@@ -3359,12 +3300,6 @@ static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_ten
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
-static void ggml_sycl_concat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_concat);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
 static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
@@ -4101,7 +4036,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
             func = ggml_sycl_group_norm;
             break;
         case GGML_OP_CONCAT:
-            func = ggml_sycl_concat;
+            func = ggml_sycl_op_concat;
             break;
         case GGML_OP_UPSCALE:
             func = ggml_sycl_upscale;
diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
index 2a789edfc..067181de3 100644
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -13,6 +13,7 @@
 #ifndef GGML_SYCL_BACKEND_HPP
 #define GGML_SYCL_BACKEND_HPP
 
+#include "concat.hpp"
 #include "common.hpp"
 #include "convert.hpp"
 #include "dequantize.hpp"
diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp
new file mode 100644
index 000000000..632eedb9d
--- /dev/null
+++ b/ggml/src/ggml-sycl/concat.cpp
@@ -0,0 +1,195 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "concat.hpp"
+#include "common.hpp"
+
+static void concat_f32_dim0(const float *x, const float *y, float *dst, // dim-0 concat kernel: each dst
+                            const int ne0, const int ne00,              // row of ne0 floats is ne00 floats
+                            const sycl::nd_item<3> &item_ct1) {         // from x, then the rest from y
+  int nidx = item_ct1.get_local_id(2) + // position of this work-item inside the dst row
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) { // work-items beyond the row length have nothing to copy
+    return;
+  }
+  // groups 1 and 0 pick the row; all three buffers are indexed as densely packed rows
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (nidx < ne00) { // src0: x rows are ne00 elements long
+    int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                     item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+  } else { // src1: y rows are (ne0 - ne00) long, so shift nidx back by ne00
+    int offset_src =
+        nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) +
+        item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+static void concat_f32_dim1(const float *x, const float *y, float *dst, // dim-1 concat kernel: rows
+                            const int ne0, const int ne01,              // [0, ne01) of each plane come
+                            const sycl::nd_item<3> &item_ct1) {         // from x, later rows from y
+  int nidx = item_ct1.get_local_id(2) + // position of this work-item inside the row
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) { // work-items beyond the row length have nothing to copy
+    return;
+  }
+  // group 1 is the dst row, group 0 the dst plane; get_group_range(1) serves as dst rows per plane
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (item_ct1.get_group(1) < ne01) { // src0: x planes hold ne01 rows each
+    int offset_src =
+        nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
+    dst[offset_dst] = x[offset_src];
+  } else { // src1: y planes hold the remaining rows, so rebase the row index by ne01
+    int offset_src =
+        nidx + (item_ct1.get_group(1) - ne01) * ne0 +
+        item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+static void concat_f32_dim2(const float *x, const float *y, float *dst, // dim-2 concat kernel: planes
+                            const int ne0, const int ne02,              // [0, ne02) come from x, later
+                            const sycl::nd_item<3> &item_ct1) {         // planes come from y
+  int nidx = item_ct1.get_local_id(2) + // position of this work-item inside the row
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) { // work-items beyond the row length have nothing to copy
+    return;
+  }
+  // group 1 is the row, group 0 the dst plane; x, y and dst all use the same plane size here
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (item_ct1.get_group(0) < ne02) { // src0: x is read at the same offset dst is written
+    int offset_src = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+  } else { // src1: rebase the plane index by ne02 before indexing y
+    int offset_src =
+        nidx + item_ct1.get_group(1) * ne0 +
+        (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+// Host-side launcher for the contiguous concat kernels (dims 0, 1 and 2).
+//   x, y, dst      : base pointers handed to the kernel unchanged
+//   ne00/ne01/ne02 : src0 extents; only the one matching `dim` is forwarded
+//   ne0/ne1/ne2    : dst extents; they also define the launch grid
+//   dim            : 0 or 1 pick their kernel, any other value runs dim 2
+//   stream         : queue the kernel is submitted to; no wait is issued here
+static void concat_f32_sycl(const float *x, const float *y, float *dst,
+                            int ne00, int ne01, int ne02, int ne0, int ne1,
+                            int ne2, int dim, queue_ptr stream) {
+  // Round up so a partially filled last block still covers the row tail.
+  const int blocks_per_row =
+      (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
+  // Work-groups are 1-D along the fastest axis; the grid is (ne2, ne1, blocks).
+  const sycl::range<3> block(1, 1, SYCL_CONCAT_BLOCK_SIZE);
+  const sycl::range<3> grid(ne2, ne1, blocks_per_row);
+  const sycl::nd_range<3> launch(grid * block, block);
+
+  if (dim == 0) {
+    stream->parallel_for(launch, [=](sycl::nd_item<3> item_ct1) {
+      concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
+    });
+    return;
+  }
+  if (dim == 1) {
+    stream->parallel_for(launch, [=](sycl::nd_item<3> item_ct1) {
+      concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
+    });
+    return;
+  }
+
+  // Fallback for every remaining dim value: the dim-2 kernel.
+  stream->parallel_for(launch, [=](sycl::nd_item<3> item_ct1) {
+    concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
+  });
+}
+
+// non-contiguous kernel (slow): one single-work-item group per dst row, addressed through byte strides
+static void concat_f32_sycl_non_cont(
+    queue_ptr stream, const char *src0, const char *src1, char *dst, // buffers as raw bytes
+    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00, // ne0x: src0 extents
+    uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/, // nb0x: src0 byte strides
+    int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10, // src1 extents are unused
+    uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1, // nb1x: src1 byte strides
+    int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2, // ne/nb: dst extents/strides
+    uint64_t nb3, int32_t dim) { // dim: axis (0..3) along which src1 follows src0
+  sycl::range<3> gridDim(ne3, ne2, ne1); // one work-group per (i3, i2, i1) dst row
+  stream->parallel_for(
+      sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), // local size 1 in every dimension
+      [=](sycl::nd_item<3> item_ct1) {
+        int64_t i3 = item_ct1.get_group(0);
+        int64_t i2 = item_ct1.get_group(1);
+        int64_t i1 = item_ct1.get_group(2);
+
+        int64_t o[4] = {0, 0, 0, 0}; // src1 index shift: src0's extent along dim, 0 elsewhere
+        o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
+
+        const float *x; // assigned on every iteration before it is read
+
+        for (int i0 = item_ct1.get_local_id(2); i0 < ne0;
+             i0 += item_ct1.get_local_range(2)) {
+          if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { // index lies inside src0
+            x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
+                                (i0)*nb00);
+          } else { // otherwise read src1 at the index shifted back by o
+            x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
+                                (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
+          }
+
+          float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
+
+          *y = *x;
+        }
+      });
+}
+
+// GGML_OP_CONCAT entry point: dst = src0 followed by src1 along dst->op_params[0].
+void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst) {
+  const int32_t dim = ((int32_t *)dst->op_params)[0];
+  queue_ptr stream = ctx.stream();
+  if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) { // strided input
+    concat_f32_sycl_non_cont(
+        stream, (const char *)src0->data, (const char *)src1->data,
+        (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
+        src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
+        src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
+        dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+    return;
+  }
+
+  const float *a = (const float *)src0->data;
+  const float *b = (const float *)src1->data;
+  float *out = (float *)dst->data;
+  if (dim == 3) { // outermost dim: src0's bytes, then src1's, copied back to back
+    const size_t bytes_a = ggml_nbytes(src0);
+    const size_t bytes_b = ggml_nbytes(src1);
+    SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(out, a, bytes_a).wait()));
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream->memcpy(out + bytes_a / 4, b, bytes_b).wait()));
+    return;
+  }
+
+  // dims 0..2: one launch per i3 slice; nb[3] / 4 is the slice stride in floats
+  for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+    concat_f32_sycl(a + i3 * (src0->nb[3] / 4), b + i3 * (src1->nb[3] / 4),
+                    out + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
+                    src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim,
+                    stream);
+  }
+}
diff --git a/ggml/src/ggml-sycl/concat.hpp b/ggml/src/ggml-sycl/concat.hpp
new file mode 100644
index 000000000..5a04feaab
--- /dev/null
+++ b/ggml/src/ggml-sycl/concat.hpp
@@ -0,0 +1,21 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_CONCAT_HPP
+#define GGML_SYCL_CONCAT_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst); // concat dim: dst->op_params[0]
+
+#endif // GGML_SYCL_CONCAT_HPP