mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 02:44:36 +00:00
ggml : add loongarch lsx and lasx support (#6454)
* add loongarch lsx and lasx optimize code * Add loongarch compilation support to makefile * revert stb_image.h * opt bytes_from_nibbles_32 and sum_i16_pairs_float * fix undeclared * format code * update * update 2 --------- Co-authored-by: Jinyang He <hejinyang@loongson.cn>
This commit is contained in:
parent
1cc0155d04
commit
65c58207ec
@ -134,6 +134,8 @@ set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeli
|
|||||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||||
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
||||||
|
# When ON and the target processor matches "loongarch64", -mlasx is appended to ARCH_FLAGS.
option(LLAMA_LASX "llama: enable lasx" ON)
|
||||||
|
# When ON and the target processor matches "loongarch64", -mlsx is appended to ARCH_FLAGS.
option(LLAMA_LSX "llama: enable lsx" ON)
|
||||||
|
|
||||||
# add perf arguments
|
# add perf arguments
|
||||||
option(LLAMA_PERF "llama: enable perf" OFF)
|
option(LLAMA_PERF "llama: enable perf" OFF)
|
||||||
@ -1108,6 +1110,17 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
|||||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
||||||
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
||||||
endif()
|
endif()
|
||||||
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||||
|
message(STATUS "loongarch64 detected")
|
||||||
|
|
||||||
|
list(APPEND ARCH_FLAGS -march=loongarch64)
|
||||||
|
if (LLAMA_LASX)
|
||||||
|
list(APPEND ARCH_FLAGS -mlasx)
|
||||||
|
endif()
|
||||||
|
if (LLAMA_LSX)
|
||||||
|
list(APPEND ARCH_FLAGS -mlsx)
|
||||||
|
endif()
|
||||||
|
|
||||||
else()
|
else()
|
||||||
message(STATUS "Unknown architecture")
|
message(STATUS "Unknown architecture")
|
||||||
endif()
|
endif()
|
||||||
|
5
Makefile
5
Makefile
@ -379,6 +379,11 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
|
|||||||
CUDA_POWER_ARCH = 1
|
CUDA_POWER_ARCH = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifneq ($(filter loongarch64%,$(UNAME_M)),)
|
||||||
|
MK_CFLAGS += -mlasx
|
||||||
|
MK_CXXFLAGS += -mlasx
|
||||||
|
endif
|
||||||
|
|
||||||
else
|
else
|
||||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||||
|
28
ggml-impl.h
28
ggml-impl.h
@ -455,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|||||||
#include <riscv_vector.h>
|
#include <riscv_vector.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__loongarch64)
|
||||||
|
#if defined(__loongarch_asx)
|
||||||
|
#include <lasxintrin.h>
|
||||||
|
#endif
|
||||||
|
#if defined(__loongarch_sx)
|
||||||
|
#include <lsxintrin.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__loongarch_sx) || defined(__loongarch_asx)

/* Gives access to the raw 32-bit pattern of a float so it can be handed to
 * the integer "replicate general register" intrinsics unchanged. */
typedef union {
    int32_t i;
    float f;
} ft_union;

#endif

/* float type data load instructions */

#if defined(__loongarch_sx)

/* Broadcast `val` to every 32-bit lane of a 128-bit LSX vector.
 * Guarded by __loongarch_sx (not __loongarch_asx) so that LSX-only builds can use it too.
 * NOTE(review): assumes __loongarch_sx is also defined whenever __loongarch_asx is,
 * as the original asx-guarded definition already relied on <lsxintrin.h> — confirm. */
static inline __m128 __lsx_vreplfr2vr_s(float val) {
    ft_union fi_tmpval = {.f = val};
    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}

#endif

#if defined(__loongarch_asx)

/* Broadcast `val` to every 32-bit lane of a 256-bit LASX vector. */
static inline __m256 __lasx_xvreplfr2vr_s(float val) {
    ft_union fi_tmpval = {.f = val};
    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
}

#endif
|
||||||
|
|
||||||
#ifdef __F16C__
|
#ifdef __F16C__
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
|
2089
ggml-quants.c
2089
ggml-quants.c
File diff suppressed because it is too large
Load Diff
189
ggml.c
189
ggml.c
@ -1523,6 +1523,195 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
|||||||
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
||||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
||||||
|
|
||||||
|
#elif defined(__loongarch_asx)
|
||||||
|
|
||||||
|
#define GGML_SIMD
|
||||||
|
|
||||||
|
// F32 LASX
|
||||||
|
#define GGML_F32_STEP 32
|
||||||
|
#define GGML_F32_EPR 8
|
||||||
|
|
||||||
|
#define GGML_F32x8 __m256
|
||||||
|
#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
|
||||||
|
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
|
||||||
|
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
|
||||||
|
#define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
|
||||||
|
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
|
||||||
|
#define GGML_F32x8_ADD __lasx_xvfadd_s
|
||||||
|
#define GGML_F32x8_MUL __lasx_xvfmul_s
|
||||||
|
#define GGML_F32x8_REDUCE(res, x) \
|
||||||
|
do { \
|
||||||
|
int offset = GGML_F32_ARR >> 1; \
|
||||||
|
for (int i = 0; i < offset; ++i) { \
|
||||||
|
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
||||||
|
} \
|
||||||
|
offset >>= 1; \
|
||||||
|
for (int i = 0; i < offset; ++i) { \
|
||||||
|
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
||||||
|
} \
|
||||||
|
offset >>= 1; \
|
||||||
|
for (int i = 0; i < offset; ++i) { \
|
||||||
|
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
||||||
|
} \
|
||||||
|
float *tmp_p = (float *)&x[0]; \
|
||||||
|
res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
|
||||||
|
} while (0)
|
||||||
|
// TODO: is this optimal ?
|
||||||
|
|
||||||
|
#define GGML_F32_VEC GGML_F32x8
|
||||||
|
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
|
||||||
|
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
|
||||||
|
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
|
||||||
|
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
|
||||||
|
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
|
||||||
|
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
|
||||||
|
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
|
||||||
|
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
|
||||||
|
|
||||||
|
// F16 LASX
|
||||||
|
|
||||||
|
#define GGML_F16_STEP 32
|
||||||
|
#define GGML_F16_EPR 8
|
||||||
|
|
||||||
|
// F16 arithmetic is not performed natively on LASX here, so we use F32 instead
|
||||||
|
|
||||||
|
#define GGML_F32Cx8 __m256
|
||||||
|
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
||||||
|
// Broadcast a float to all lanes. __lasx_xvreplgr2vr_w replicates an integer register value,
// so passing a float to it would convert the value to int instead of keeping its bit pattern;
// use the float helper, exactly like GGML_F32x8_SET1.
#define GGML_F32Cx8_SET1(x)     (__m256)__lasx_xvreplfr2vr_s((x))
|
||||||
|
|
||||||
|
// Load 8 consecutive fp16 values from x, widen each to fp32 and return them as one LASX vector.
static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
    float widened[8];
    int lane = 0;

    // scalar conversion into a stack buffer, then a single vector load
    while (lane < 8) {
        widened[lane] = GGML_FP16_TO_FP32(x[lane]);
        ++lane;
    }

    return (__m256)__lasx_xvld(widened, 0);
}
|
||||||
|
// Spill the 8 fp32 lanes of y to a stack buffer, narrow each to fp16 and write them to x[0..7].
static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
    float lanes[8];
    int k;

    __lasx_xvst(y, lanes, 0);

    for (k = 0; k < 8; ++k) {
        x[k] = GGML_FP32_TO_FP16(lanes[k]);
    }
}
|
||||||
|
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
|
||||||
|
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
|
||||||
|
|
||||||
|
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
|
||||||
|
#define GGML_F32Cx8_ADD __lasx_xvfadd_s
|
||||||
|
#define GGML_F32Cx8_MUL __lasx_xvfmul_s
|
||||||
|
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
|
||||||
|
|
||||||
|
#define GGML_F16_VEC GGML_F32Cx8
|
||||||
|
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
|
||||||
|
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
|
||||||
|
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
|
||||||
|
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
|
||||||
|
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
|
||||||
|
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
|
||||||
|
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
|
||||||
|
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
|
||||||
|
|
||||||
|
#elif defined(__loongarch_sx)
|
||||||
|
|
||||||
|
#define GGML_SIMD
|
||||||
|
|
||||||
|
// F32 LSX
|
||||||
|
|
||||||
|
#define GGML_F32_STEP 32
|
||||||
|
#define GGML_F32_EPR 4
|
||||||
|
|
||||||
|
#define GGML_F32x4         __m128
// __lsx_vldi / __lsx_vld produce the integer vector type; cast explicitly to the float
// vector type, as the LASX branch does for __m256.
#define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
// Broadcast x to all four lanes. The previous form inserted an int-converted x into lane 0 only.
// NOTE: x is expanded four times - pass a side-effect-free expression.
#define GGML_F32x4_SET1(x) ((__m128){ (x), (x), (x), (x) })
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
// Macro parameters must be plain identifiers: "((x),(y))" is not a valid parameter list.
#define GGML_F32x4_STORE(x, y)   __lsx_vst((__m128i)(y), (x), 0)
// a + b*c
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD     __lsx_vfadd_s
#define GGML_F32x4_MUL     __lsx_vfmul_s
|
||||||
|
// Sum every lane of the accumulator array x into the scalar res.
// The three halving passes fold x[] down into x[0] (x is modified in place).
// NOTE(review): three passes assume GGML_F32_ARR == 8 (GGML_F32_STEP / GGML_F32_EPR) - confirm.
// The final horizontal add reads the four float lanes of x[0] directly, like the LASX
// variant; the previous code extracted lane 0 as an *integer* and value-converted that
// bit pattern, and mixed __m128 / __m128i without casts.
#define GGML_F32x4_REDUCE(res, x)                                   \
{                                                                   \
    int offset = GGML_F32_ARR >> 1;                                 \
    for (int i = 0; i < offset; ++i) {                              \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                    \
    }                                                               \
    offset >>= 1;                                                   \
    for (int i = 0; i < offset; ++i) {                              \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                    \
    }                                                               \
    offset >>= 1;                                                   \
    for (int i = 0; i < offset; ++i) {                              \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                    \
    }                                                               \
    const float * tmp_p = (const float *) &x[0];                    \
    res = (ggml_float) ((tmp_p[0] + tmp_p[1]) + (tmp_p[2] + tmp_p[3])); \
}
|
||||||
|
|
||||||
|
#define GGML_F32_VEC GGML_F32x4
|
||||||
|
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
||||||
|
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
||||||
|
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
||||||
|
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
||||||
|
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
||||||
|
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
||||||
|
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
||||||
|
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||||
|
|
||||||
|
// F16 LSX
|
||||||
|
|
||||||
|
#define GGML_F16_STEP 32
|
||||||
|
#define GGML_F16_EPR 4
|
||||||
|
|
||||||
|
// Load 4 consecutive fp16 values from x, widen each to fp32 and return them as one LSX vector.
static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
    float tmp[4];

    tmp[0] = GGML_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_FP16_TO_FP32(x[3]);

    // __lsx_vld yields the integer vector type; cast to the declared float vector return type
    return (__m128)__lsx_vld(tmp, 0);
}
|
||||||
|
|
||||||
|
// Spill the 4 fp32 lanes of y to a stack buffer, narrow each to fp16 and write them to x[0..3].
static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
    float lanes[4];

    __lsx_vst(y, lanes, 0);

    for (int k = 0; k < 4; ++k) {
        x[k] = GGML_FP32_TO_FP16(lanes[k]);
    }
}
|
||||||
|
|
||||||
|
#define GGML_F32Cx4             __m128
// __lsx_vldi produces the integer vector type; cast explicitly to the float vector type
#define GGML_F32Cx4_ZERO        (__m128)__lsx_vldi(0)
// Broadcast x to all four lanes. The previous form inserted an int-converted x into lane 0 only.
// NOTE: x is expanded four times - pass a side-effect-free expression.
#define GGML_F32Cx4_SET1(x)     ((__m128){ (x), (x), (x), (x) })
|
||||||
|
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
|
||||||
|
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
|
||||||
|
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
||||||
|
#define GGML_F32Cx4_ADD __lsx_vfadd_s
|
||||||
|
#define GGML_F32Cx4_MUL __lsx_vfmul_s
|
||||||
|
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
||||||
|
|
||||||
|
#define GGML_F16_VEC GGML_F32Cx4
|
||||||
|
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
||||||
|
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
||||||
|
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
||||||
|
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
|
||||||
|
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
||||||
|
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
||||||
|
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
||||||
|
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// GGML_F32_ARR / GGML_F16_ARR
|
// GGML_F32_ARR / GGML_F16_ARR
|
||||||
|
Loading…
Reference in New Issue
Block a user