From c18610b4ee29ca056bb4f2d375a4ad1b16f44ef7 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Fri, 22 Nov 2024 14:07:20 +0800 Subject: [PATCH] CANN: Support Ascend310P to accelerate F32 and F16 Model (#10216) * CANN Support Ascend310P to accelerate F32 and F16 Model * Add compile option soc type macro ASCEND_310P to ggml-cann lib * Remove unused code * Remove the ascend soc_type hard code compile option in CMakelist.txt --- ggml/src/ggml-cann/CMakeLists.txt | 29 ++++++++++++++++ ggml/src/ggml-cann/aclnn_ops.cpp | 18 ++++++++++ ggml/src/ggml-cann/kernels/CMakeLists.txt | 7 ++-- ggml/src/ggml-cann/kernels/dup.cpp | 32 +++++++++++++----- ggml/src/ggml-cann/kernels/get_row_f16.cpp | 37 +++++++++++++-------- ggml/src/ggml-cann/kernels/get_row_f32.cpp | 36 ++++++++++++-------- ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 5 ++- 7 files changed, 123 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index c8e15c6d4..756200b89 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") endif() +# Auto-detech Soc type and Soc version, if detect failed, will abort build +set(SOC_VERSION "") +function(detect_ascend_soc_type SOC_VERSION) + execute_process( + COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'" + OUTPUT_VARIABLE npu_info + RESULT_VARIABLE npu_result + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if("${npu_info}" STREQUAL "" OR ${npu_result}) + message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.") + endif() + set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE) +endfunction() + +if(NOT SOC_TYPE) + detect_ascend_soc_type(SOC_VERSION) + set(SOC_TYPE "${SOC_VERSION}") + message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") +else() + string(TOLOWER ${SOC_TYPE} SOC_VERSION) +endif() + +# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P. +string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") +set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") + if (CANN_INSTALL_DIR) # Only Support Linux. if (NOT UNIX) @@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR) target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) + target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") + message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") else() diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a4ec8418e..1f4ee986c 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { switch (src0->type) { case GGML_TYPE_F32: + { +#ifdef ASCEND_310P + // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 8) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif aclrtlaunch_ascendc_get_row_f32( 24, ctx.stream(), src0->data, src1->data, dst->data, ((ggml_tensor*)src0->extra)->ne, @@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)dst->extra)->nb); break; + } case GGML_TYPE_F16: + { +#ifdef ASCEND_310P + // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 16) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16 + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif aclrtlaunch_ascendc_get_row_f16( 24, ctx.stream(), src0->data, src1->data, dst->data, ((ggml_tensor*)src0->extra)->ne, @@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)dst->extra)->nb); break; + } case GGML_TYPE_Q4_0: aclrtlaunch_ascendc_get_row_q4_0( 24, ctx.stream(), src0->data, src1->data, dst->data, diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt index 5b4fef91b..6a4e17cce 100644 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -1,7 +1,3 @@ -if (NOT SOC_TYPE) - set (SOC_TYPE "Ascend910B3") -endif() - file(GLOB SRC_FILES get_row_f32.cpp get_row_f16.cpp @@ -13,7 +9,6 @@ file(GLOB SRC_FILES dup.cpp ) -string(TOLOWER ${SOC_TYPE} SOC_VERSION) set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") @@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC ${SRC_FILES} ) +message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.") +ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp index e2c651152..99f03e058 100644 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ b/ggml/src/ggml-cann/kernels/dup.cpp @@ -5,6 +5,7 @@ using namespace AscendC; #define BUFFER_NUM 2 +const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template template class DupByRows { @@ -19,6 +20,7 @@ class DupByRows { // Input has four dims. int64_t op_block_num = GetBlockNum(); int64_t op_block_idx = GetBlockIdx(); + assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM); // param num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; @@ -51,24 +53,36 @@ class DupByRows { __aicore__ inline void copy_in() { LocalTensor src_local = src_queue.AllocTensor(); - - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = num_elem * sizeof(SRC_T); - DataCopyPadExtParams padParams; - DataCopyPad(src_local, src_gm, dataCopyParams, padParams); - + const size_t elem_per_block = 32 / sizeof(SRC_T); + size_t tail = num_elem % elem_per_block; + size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; + DataCopy(src_local, src_gm, cpy_elements_len); src_queue.EnQue(src_local); } __aicore__ inline void copy_out() { LocalTensor dst_local = dst_queue.DeQue(); - +#ifdef ASCEND_310P + const size_t elem_per_block = 32 / sizeof(DST_T); + size_t tail = num_elem % elem_per_block; + size_t len = num_elem & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(dst_gm, dst_local, len); + } + if(tail != 0) { + for (size_t i = tail; i < elem_per_block; i++) { + dst_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(dst_gm[len], dst_local[len], elem_per_block); + SetAtomicNone(); + } +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = num_elem * sizeof(DST_T); DataCopyPad(dst_gm, dst_local, dataCopyParams); - +#endif dst_queue.FreeTensor(dst_local); } diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp index c704b5b2e..416b45104 100644 --- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp @@ -14,7 +14,7 @@ class GET_ROW_F16 { int64_t *output_ne_ub, size_t *output_nb_ub) { // TODO, use template for F16/f32 int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); + op_block_idx = GetBlockIdx(); for (int i = 0; i < 4; i++) { input_ne[i] = input_ne_ub[i]; @@ -59,32 +59,42 @@ class GET_ROW_F16 { } __aicore__ inline void copy_in(uint32_t offset, size_t len) { + size_t origin_len = len; LocalTensor input_local = input_queue.AllocTensor(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(input_local, input_gm[offset], len); + const size_t elem_per_block = 32 / sizeof(half); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); if(tail != 0) { - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(half); - DataCopyPadExtParams padParams; - DataCopyPad(input_local[len], input_gm[offset + len], - dataCopyParams, padParams); + len += elem_per_block; } + DataCopy(input_local, input_gm[offset], len); input_queue.EnQue(input_local); } __aicore__ inline void copy_out(uint32_t offset, size_t len) { LocalTensor output_local = output_queue.DeQue(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(output_gm[offset], output_local, len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = tail * sizeof(float); DataCopyPad(output_gm[offset + len], output_local[len], dataCopyParams); +#endif } output_queue.FreeTensor(output_local); } @@ -150,6 +160,7 @@ class GET_ROW_F16 { GlobalTensor output_gm; TQue input_queue; TQue output_queue; + int64_t op_block_idx; }; template diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp index 9db080af3..02116905b 100644 --- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp @@ -13,7 +13,7 @@ class GET_ROW_F32 { int64_t *indices_ne_ub, size_t *indices_nb_ub, int64_t *output_ne_ub, size_t *output_nb_ub) { int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); + op_block_idx = GetBlockIdx(); for (int i = 0; i < 4; i++) { input_ne[i] = input_ne_ub[i]; @@ -55,31 +55,40 @@ class GET_ROW_F32 { __aicore__ inline void copy_in(uint32_t offset, size_t len) { LocalTensor input_local = input_queue.AllocTensor(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(input_local, input_gm[offset], len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); if(tail != 0) { - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPadExtParams padParams; - DataCopyPad(input_local[len], input_gm[offset + len], - dataCopyParams, padParams); + len += elem_per_block; } + DataCopy(input_local, input_gm[offset], len); input_queue.EnQue(input_local); } __aicore__ inline void copy_out(uint32_t offset, size_t len) { LocalTensor output_local = output_queue.DeQue(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(output_gm[offset], output_local, len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = tail * sizeof(float); DataCopyPad(output_gm[offset + len], output_local[len], dataCopyParams); +#endif } output_queue.FreeTensor(output_local); } @@ -144,6 +153,7 @@ class GET_ROW_F32 { GlobalTensor output_gm; TQue input_queue; TQue output_queue; + int64_t op_block_idx; }; template diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp index a80bfeec2..377211096 100644 --- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp @@ -110,9 +110,12 @@ class GET_ROW_Q4_0 { LocalTensor output_local = output_queue.AllocTensor(); // TODO: cast more data to speed up. +#ifdef ASCEND_310P + // TODO: 310P support quantification +#else Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); - +#endif // Only mul need compile by group. half scale = scale_gm.GetValue(scale_offset);