diff --git a/ggml-quants.c b/ggml-quants.c index 1128d66e2..9f864e5c4 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -6088,6 +6088,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; + const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0); const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4); const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4); @@ -6807,6 +6808,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const uint8_t * restrict q3 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); __m128i scales128 = lsx_set_w( @@ -6830,8 +6833,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r int is = 0; __m256i xvbit; - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; for (int j = 0; j < QK_K/128; ++j) { // load low 2 bits @@ -7404,6 +7405,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = vec_extract(vsumf0, 0); #elif defined __loongarch_asx + GGML_UNUSED(kmask1); + GGML_UNUSED(kmask2); + GGML_UNUSED(kmask3); const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); @@ -7416,6 +7420,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -7455,16 +7464,17 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r __m256 vd = __lasx_xvreplfr2vr_s(d); acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); + } acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee)); __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0); acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1); + ft_union fi; fi.i = __lsx_vpickve2gr_w(acc_m, 0); *s = hsum_float_8(acc) + fi.f ; - #else const uint8_t * scales = (const uint8_t*)&utmp[0]; @@ -8002,6 +8012,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = vec_extract(vsumf0, 0); #elif defined __loongarch_asx + GGML_UNUSED(kmask1); + GGML_UNUSED(kmask2); + GGML_UNUSED(kmask3); const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); const __m128i mzero = __lsx_vldi(0); @@ -8020,6 +8033,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -8069,10 +8087,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r p16_1 = lasx_madd_h(scale_1, p16_1); sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); + } __m256 vd = __lasx_xvreplfr2vr_s(d); acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); + } *s = hsum_float_8(acc) + summs; diff --git a/ggml.c b/ggml.c index 76803639c..f479dc3e1 100644 --- a/ggml.c +++ b/ggml.c @@ -1576,11 +1576,11 @@ do { \ // F16 arithmetic is not supported by AVX, so we use F32 instead -#define GGML_F32Cx8 __m256 +#define GGML_F32Cx8 __m256 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x)) -static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) { +static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { float tmp[8]; for (int i = 0; i < 8; i++) { @@ -1589,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) { return (__m256)__lasx_xvld(tmp, 0); } -static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) { +static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { float arr[8]; __lasx_xvst(y, arr, 0); - for (int i = 0; i < 8; i++) + for (int i = 0; i < 8; i++) { x[i] = GGML_FP32_TO_FP16(arr[i]); + } } #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y) @@ -1671,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) { #define GGML_F16_STEP 32 #define GGML_F16_EPR 4 -static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) { +static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; tmp[0] = GGML_FP16_TO_FP32(x[0]); @@ -1682,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) { return __lsx_vld(tmp, 0); } -static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) { +static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { float arr[4]; __lsx_vst(y, arr, 0); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 766a01752..cfa707315 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -129,8 +129,11 @@ llama_target_and_test(test-rope.cpp) llama_target_and_test(test-model-load-cancel.cpp LABEL "model") llama_target_and_test(test-autorelease.cpp LABEL "model") -llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) -target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server) +# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 +if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) + target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server) +endif() # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE)