From 8fbd59308b54729a191dcf3aee3388abfa7dd6e3 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Fri, 28 Jun 2024 22:52:57 -0400
Subject: [PATCH] ggml-quants : attempt to fix Arm 32-bit support

---
 ggml/src/ggml-impl.h   | 11 ++++-------
 ggml/src/ggml-quants.c |  8 ++++----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 1d2336190..374b5ae04 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -177,7 +177,7 @@ typedef __fp16 ggml_fp16_internal_t;
 
 // 32-bit ARM compatibility
 
-// vaddvq_s16
+// vaddlvq_s16
 // vpaddq_s16
 // vpaddq_s32
 // vaddvq_s32
@@ -187,12 +187,9 @@ typedef __fp16 ggml_fp16_internal_t;
 // vzip1_u8
 // vzip2_u8
 
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+inline static int32_t vaddlvq_s16(int16x8_t v) {
+    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
+    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
 }
 
 inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 1f7460ac4..df4320f5b 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -11483,10 +11483,10 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
         // WARNING: reading 3 bytes further than necessary
         const uint8x16_t x13b = vld1q_u8((const uint8_t *) x);
 
-        uint8x16_t x0 = vqtbl1q_u8(x13b, mask0);
-        uint8x16_t x1 = vqtbl1q_u8(x13b, mask1);
-        uint8x16_t x2 = vqtbl1q_u8(x13b, mask2);
-        uint8x16_t x3 = vqtbl1q_u8(x13b, mask3);
+        uint8x16_t x0 = ggml_vqtbl1q_u8(x13b, mask0);
+        uint8x16_t x1 = ggml_vqtbl1q_u8(x13b, mask1);
+        uint8x16_t x2 = ggml_vqtbl1q_u8(x13b, mask2);
+        uint8x16_t x3 = ggml_vqtbl1q_u8(x13b, mask3);
 
         x0 = vmulq_u8(x0, shift0);
         x1 = vmulq_u8(x1, shift0);