ggml : add loongarch lsx and lasx support (#6454)

* add loongarch lsx and lasx optimize code * Add loongarch compilation support to makefile * revert stb_image.h * opt bytes_from_nibbles_32 and sum_i16_pairs_float * fix undeclared * format code * update * update 2 --------- Co-authored-by: Jinyang He <hejinyang@loongson.cn>
2024-12-25 02:44:36 +00:00 · 2024-05-20 15:19:21 +08:00 · 2024-05-20 15:19:21 +08:00 · 65c58207ec
commit 65c58207ec
parent 1cc0155d04
5 changed files with 2316 additions and 8 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -134,6 +134,8 @@ set(LLAMA_SCHED_MAX_COPIES  "4" CACHE STRING "llama: max input copies for pipeli
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
 # LoongArch SIMD toggles (both default ON). They are consulted only in the
 # loongarch64 branch of the architecture detection, where they append
 # -mlasx / -mlsx to ARCH_FLAGS.
 option(LLAMA_LASX                            "llama: enable lasx"                               ON)
 option(LLAMA_LSX                             "llama: enable lsx"                                ON)
 # add perf arguments
 option(LLAMA_PERF                            "llama: enable perf"                               OFF)
@ -1108,6 +1110,17 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
    # LoongArch 64-bit target: always select the base ISA, then add the SIMD
    # extension flags that the LLAMA_LASX / LLAMA_LSX options request.
    message(STATUS "loongarch64 detected")
    list(APPEND ARCH_FLAGS -march=loongarch64)
    # LASX flag, controlled by the LLAMA_LASX option
    if (LLAMA_LASX)
        list(APPEND ARCH_FLAGS -mlasx)
    endif()
    # LSX flag, controlled by the LLAMA_LSX option
    if (LLAMA_LSX)
        list(APPEND ARCH_FLAGS -mlsx)
    endif()
 else()
    message(STATUS "Unknown architecture")
 endif()
--- a/Makefile
+++ b/Makefile
@ -379,6 +379,11 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
 	CUDA_POWER_ARCH = 1
 endif
 # LoongArch 64-bit hosts (UNAME_M starts with "loongarch64"): add -mlasx to
 # both the C and C++ compiler flags.
 ifneq ($(filter loongarch64%,$(UNAME_M)),)
 	MK_CFLAGS   += -mlasx
 	MK_CXXFLAGS += -mlasx
 endif
 else
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
--- a/ggml-impl.h
+++ b/ggml-impl.h
@ -455,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 #if defined(__loongarch64)
 #if defined(__loongarch_asx)
 #include <lasxintrin.h>
 #endif
 #if defined(__loongarch_sx)
 #include <lsxintrin.h>
 #endif
 #endif
 #if defined(__loongarch_sx) || defined(__loongarch_asx)
 /*
  * Overlays a float and a 32-bit integer so that a float's bit pattern can be
  * handed unchanged to the integer-typed "replicate GPR" intrinsics below.
  */
 typedef union {
    int32_t i;
    float f;
 } ft_union;
 /* float type data load instructions */
 /*
  * Returns a 128-bit LSX vector built by replicating the bit pattern of `val`
  * through __lsx_vreplgr2vr_w. Available whenever LSX is enabled, not only
  * when LASX is, so LSX-only builds can use it as well.
  */
 static inline __m128 __lsx_vreplfr2vr_s(float val) {
    ft_union fi_tmpval = {.f = val};
    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
 }
 #endif
 #if defined(__loongarch_asx)
 /*
  * Returns a 256-bit LASX vector built by replicating the bit pattern of `val`
  * through __lasx_xvreplgr2vr_w.
  */
 static inline __m256 __lasx_xvreplfr2vr_s(float val) {
    ft_union fi_tmpval = {.f = val};
    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
 }
 #endif
 #ifdef __F16C__
 #ifdef _MSC_VER
--- a/ggml-quants.c
+++ b/ggml-quants.c
--- a/ggml.c
+++ b/ggml.c
@ -1523,6 +1523,195 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
 #define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
 #define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
 #elif defined(__loongarch_asx)
 #define GGML_SIMD
 // F32 LASX
 // Maps the F32x8 primitive operations onto LASX intrinsics working on __m256 values.
 // GGML_F32_EPR is 8 and GGML_F32_STEP is 32; GGML_F32_ARR (used by the REDUCE macro)
 // is defined outside this section -- presumably as STEP / EPR, TODO confirm.
 #define GGML_F32_STEP 32
 #define GGML_F32_EPR  8
 #define GGML_F32x8         __m256
 // __lasx_xvldi / __lasx_xvld results are cast to the float vector type __m256
 #define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
 #define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
 #define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
 // STORE(x, y): x is the destination pointer, y the vector (the intrinsic takes the vector first)
 #define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
 // FMA(a, b, c): the accumulator a is passed as the intrinsic's last operand
 #define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
 #define GGML_F32x8_ADD     __lasx_xvfadd_s
 #define GGML_F32x8_MUL     __lasx_xvfmul_s
 // Horizontal reduction: sums every float held in the vector array x into the scalar res.
 //   res - lvalue that receives the total
 //   x   - array of GGML_F32_ARR vectors; it is overwritten while being folded
 // The array is folded in three fixed halving passes (x[i] += x[offset+i]), so x[0]
 // holds the complete sum only when GGML_F32_ARR <= 8. The eight lanes of x[0] are
 // then added up through a float pointer.
 #define GGML_F32x8_REDUCE(res, x)                                 \
 do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
    }                                                             \
    float *tmp_p = (float *)&x[0]; \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
 } while (0)
 // TODO: is this optimal ?
 // Generic F32 vector names used by the vectorized routines, bound here to the
 // LASX F32x8 implementations.
 #define GGML_F32_VEC        GGML_F32x8
 #define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
 #define GGML_F32_VEC_SET1   GGML_F32x8_SET1
 #define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
 #define GGML_F32_VEC_STORE  GGML_F32x8_STORE
 #define GGML_F32_VEC_FMA    GGML_F32x8_FMA
 #define GGML_F32_VEC_ADD    GGML_F32x8_ADD
 #define GGML_F32_VEC_MUL    GGML_F32x8_MUL
 #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
 // F16 LASX
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  8
 // F16 values are processed as F32 here: they are widened to float on load and
 // narrowed back on store, so the arithmetic reuses the F32 LASX operations
 #define GGML_F32Cx8             __m256
 #define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
 // Broadcast a float: use the float-aware helper so the value's bit pattern is
 // replicated. Passing a float straight to the integer intrinsic
 // __lasx_xvreplgr2vr_w would first convert it to int and corrupt the value.
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
 /*
  * Reads 8 half-precision values starting at x, converts each one to float
  * with GGML_FP16_TO_FP32 into a temporary buffer, and returns that buffer
  * loaded as one __m256.
  */
 static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
    float widened[8];
    int lane = 0;
    while (lane < 8) {
        widened[lane] = GGML_FP16_TO_FP32(x[lane]);
        ++lane;
    }
    return (__m256)__lasx_xvld(widened, 0);
 }
 /*
  * Spills the vector y into a temporary float buffer, then writes its 8 values
  * to x[0..7], converting each one with GGML_FP32_TO_FP16.
  */
 static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
    float lanes[8];
    __lasx_xvst(y, lanes, 0);
    int k = 0;
    while (k < 8) {
        x[k] = GGML_FP32_TO_FP16(lanes[k]);
        ++k;
    }
 }
 // F32Cx8 load/store go through the converting helpers; the arithmetic reuses
 // the F32 LASX operations.
 #define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
 #define GGML_F32Cx8_FMA         GGML_F32x8_FMA
 #define GGML_F32Cx8_ADD         __lasx_xvfadd_s
 #define GGML_F32Cx8_MUL         __lasx_xvfmul_s
 #define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
 // Generic F16 vector names bound to the F32Cx8 implementations.
 // LOAD ignores its index argument i; STORE uses it to select r[i].
 #define GGML_F16_VEC                GGML_F32Cx8
 #define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
 #define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
 #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
 #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
 #define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
 #define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
 #define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
 #define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
 #elif defined(__loongarch_sx)
 #define GGML_SIMD
 // F32 LSX
 #define GGML_F32_STEP 32
 #define GGML_F32_EPR  4
 #define GGML_F32x4         __m128
 #define GGML_F32x4_ZERO    __lsx_vldi(0)
 #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
 #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
 #define GGML_F32x4_STORE((x),(y))   __lsx_vst((y), (x), 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
 #define GGML_F32x4_REDUCE(res, x)                                 \
 {                                                                 \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
    }                                                             \
    __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
    tmp = __lsx_vsrli_d((__m128i)t0, 32); \
    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0);        \
 }
 #define GGML_F32_VEC        GGML_F32x4
 #define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
 #define GGML_F32_VEC_SET1   GGML_F32x4_SET1
 #define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
 #define GGML_F32_VEC_STORE  GGML_F32x4_STORE
 #define GGML_F32_VEC_FMA    GGML_F32x4_FMA
 #define GGML_F32_VEC_ADD    GGML_F32x4_ADD
 #define GGML_F32_VEC_MUL    GGML_F32x4_MUL
 #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
 // F16 LSX
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  4
 static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
    float tmp[4];
    tmp[0] = GGML_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_FP16_TO_FP32(x[3]);
    return __lsx_vld(tmp, 0);
 }
 static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
    float arr[4];
    __lsx_vst(y, arr, 0);
    x[0] = GGML_FP32_TO_FP16(arr[0]);
    x[1] = GGML_FP32_TO_FP16(arr[1]);
    x[2] = GGML_FP32_TO_FP16(arr[2]);
    x[3] = GGML_FP32_TO_FP16(arr[3]);
 }
 #define GGML_F32Cx4             __m128
 #define GGML_F32Cx4_ZERO        __lsx_vldi(0)
 #define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
 #define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA         GGML_F32x4_FMA
 #define GGML_F32Cx4_ADD         __lsx_vfadd_s
 #define GGML_F32Cx4_MUL         __lsx_vfmul_s
 #define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
 #define GGML_F16_VEC                 GGML_F32Cx4
 #define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
 #define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
 #define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
 #define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
 #define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
 #define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
 #define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
 #define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
 #endif
 // GGML_F32_ARR / GGML_F16_ARR