mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 20:14:29 +00:00
CANN: Fix SOC_TYPE compile bug (#10519)
* CANN: Fix the bug build fail on Ascend310P under two cases: 1) Manual specify SOC_TYPE 2) Under some unusual compile environment * Update the cann backend News content: Support F16 and F32 data type model for Ascend 310P NPU. * fix CANN compile fail bug: the assert in ascend kernel function doesn't supportted on some CANN version
This commit is contained in:
parent
b7420131bf
commit
605fa66c50
@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
|||||||
|
|
||||||
## News
|
## News
|
||||||
|
|
||||||
|
- 2024.11
|
||||||
|
- Support F16 and F32 data type model for Ascend 310P NPU.
|
||||||
- 2024.8
|
- 2024.8
|
||||||
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
|
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
|
||||||
- 2024.7
|
- 2024.7
|
||||||
@ -43,6 +45,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
|||||||
| Ascend NPU | Status |
|
| Ascend NPU | Status |
|
||||||
|:-----------------------------:|:-------:|
|
|:-----------------------------:|:-------:|
|
||||||
| Atlas 300T A2 | Support |
|
| Atlas 300T A2 | Support |
|
||||||
|
| Atlas 300I Duo | Support |
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
|
||||||
|
@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
|
|||||||
detect_ascend_soc_type(SOC_VERSION)
|
detect_ascend_soc_type(SOC_VERSION)
|
||||||
set(SOC_TYPE "${SOC_VERSION}")
|
set(SOC_TYPE "${SOC_VERSION}")
|
||||||
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
|
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
|
||||||
else()
|
|
||||||
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
|
string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
|
||||||
|
|
||||||
|
# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
|
||||||
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
|
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
|
||||||
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
|
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
|
||||||
|
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
|
||||||
|
|
||||||
if (CANN_INSTALL_DIR)
|
if (CANN_INSTALL_DIR)
|
||||||
# Only Support Linux.
|
# Only Support Linux.
|
||||||
|
@ -25,6 +25,6 @@ ascendc_library(ascendc_kernels STATIC
|
|||||||
${SRC_FILES}
|
${SRC_FILES}
|
||||||
)
|
)
|
||||||
|
|
||||||
message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
|
message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
|
||||||
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
||||||
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
|
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
|
||||||
|
@ -20,7 +20,6 @@ class DupByRows {
|
|||||||
// Input has four dims.
|
// Input has four dims.
|
||||||
int64_t op_block_num = GetBlockNum();
|
int64_t op_block_num = GetBlockNum();
|
||||||
int64_t op_block_idx = GetBlockIdx();
|
int64_t op_block_idx = GetBlockIdx();
|
||||||
assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
|
|
||||||
|
|
||||||
// param
|
// param
|
||||||
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
|
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
|
||||||
|
@ -2,6 +2,15 @@
|
|||||||
|
|
||||||
// optimize me. Use template to avoid copy code.
|
// optimize me. Use template to avoid copy code.
|
||||||
using namespace AscendC;
|
using namespace AscendC;
|
||||||
|
#ifdef ASCEND_310P // 310P not support 4bit get row
|
||||||
|
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
||||||
|
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||||
|
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||||
|
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||||
|
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||||
|
printf("Ascend310P not support 4bit get row.\n");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
#define BUFFER_NUM 2
|
#define BUFFER_NUM 2
|
||||||
|
|
||||||
@ -110,12 +119,9 @@ class GET_ROW_Q4_0 {
|
|||||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||||
|
|
||||||
// TODO: cast more data to speed up.
|
// TODO: cast more data to speed up.
|
||||||
#ifdef ASCEND_310P
|
|
||||||
// TODO: 310P support quantification
|
|
||||||
#else
|
|
||||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
|
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
|
||||||
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
|
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
|
||||||
#endif
|
|
||||||
// Only mul need compile by group.
|
// Only mul need compile by group.
|
||||||
half scale = scale_gm.GetValue(scale_offset);
|
half scale = scale_gm.GetValue(scale_offset);
|
||||||
|
|
||||||
@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
|||||||
indices_nb_ub, output_ne_ub, output_nb_ub);
|
indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||||
op.calculate();
|
op.calculate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // #ifdef ASCEND_310P
|
||||||
|
@ -1,6 +1,14 @@
|
|||||||
#include "kernel_operator.h"
|
#include "kernel_operator.h"
|
||||||
|
|
||||||
using namespace AscendC;
|
using namespace AscendC;
|
||||||
|
#ifdef ASCEND_310P
|
||||||
|
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
||||||
|
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||||
|
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||||
|
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||||
|
printf("Ascend310P not support f16->8bit quantization.\n");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
#define BUFFER_NUM 2
|
#define BUFFER_NUM 2
|
||||||
#define QK8_0 32
|
#define QK8_0 32
|
||||||
@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
|||||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||||
op.calculate();
|
op.calculate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // #ifdef ASCEND_310P
|
||||||
|
@ -1,6 +1,14 @@
|
|||||||
#include "kernel_operator.h"
|
#include "kernel_operator.h"
|
||||||
|
|
||||||
using namespace AscendC;
|
using namespace AscendC;
|
||||||
|
#ifdef ASCEND_310P // 310P not support f32->8bit quantization
|
||||||
|
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
||||||
|
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||||
|
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||||
|
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||||
|
printf("Ascend310P not support f32->8bit quantization.\n");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
#define BUFFER_NUM 2
|
#define BUFFER_NUM 2
|
||||||
#define QK8_0 32
|
#define QK8_0 32
|
||||||
@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
|||||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||||
op.calculate();
|
op.calculate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // #ifdef ASCEND_310P
|
||||||
|
@ -1,6 +1,21 @@
|
|||||||
#include "kernel_operator.h"
|
#include "kernel_operator.h"
|
||||||
|
|
||||||
using namespace AscendC;
|
using namespace AscendC;
|
||||||
|
#ifdef ASCEND_310P // 310P not support float->4bit quantization
|
||||||
|
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
||||||
|
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||||
|
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||||
|
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||||
|
printf("Ascend310P not support f32->4bit quantization.\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
|
||||||
|
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||||
|
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||||
|
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||||
|
printf("Ascend310P not support f16->4bit quantization.\n");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
#define BUFFER_NUM 2
|
#define BUFFER_NUM 2
|
||||||
#define Group_Size 32
|
#define Group_Size 32
|
||||||
@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
|||||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||||
op.calculate();
|
op.calculate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // #ifdef ASCEND_310P
|
||||||
|
Loading…
Reference in New Issue
Block a user