From a201c6b5f7faad9d3f85ceb4b6666034019944a6 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 10 Sep 2024 21:01:57 -0400 Subject: [PATCH] shuffle --- ggml/src/ggml-quants.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 9b801ffd9..90d6be1c5 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -12359,9 +12359,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * uint64_t dex = ((uint64_t) x[ib + 3].d << 48) | ((uint64_t) x[ib + 2].d << 32) | ((uint64_t) x[ib + 1].d << 16) | (x[ib].d); uint64_t dey = ((uint64_t) y[ib + 3].d << 48) | ((uint64_t) y[ib + 2].d << 32) | ((uint64_t) y[ib + 1].d << 16) | (y[ib].d); - __m128 dex_32 = _mm_cvtph_ps(_mm_set_epi64x(0, dex)); - __m128 dey_32 = _mm_cvtph_ps(_mm_set_epi64x(0, dey)); - __m128 de_mul = _mm_mul_ps(dex_32, dey_32); + const __m128 dex32 = _mm_cvtph_ps(_mm_set_epi64x(0, dex)); + const __m128 dey32 = _mm_cvtph_ps(_mm_set_epi64x(0, dey)); + const __m128 de_mul = _mm_mul_ps(dex32, dey32); for (int j = 0; j < 2; j++) { const int ibos = ib + (j * 2); @@ -12385,9 +12385,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); - accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(_mm_cvtss_f32(_mm_permutevar_ps(de_mul, _mm_cvtsi32_si128(ibos)))), + const __m128 ac1d = _mm_shuffle_ps(de_mul, de_mul, (j * 2) * 0x55); + accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set_m128(ac1d, ac1d), _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1); - accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(_mm_cvtss_f32(_mm_permutevar_ps(de_mul, _mm_cvtsi32_si128(ibos + 1)))), + const __m128 ac2d = _mm_shuffle_ps(de_mul, de_mul, (j * 2 + 1) * 0x55); + accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set_m128(ac2d, ac2d), _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2); } }