mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-05 00:04:36 +00:00
iq1_s: AVX2 finally works
This commit is contained in:
parent
67e7c4238e
commit
2ffb05acc8
@@ -9333,8 +9333,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|||||||
const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
||||||
memcpy(&aux64, sc, 8); sc += 8;
|
memcpy(&aux64, sc, 8); sc += 8;
|
||||||
const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
|
const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
|
||||||
const __m256i hbit = _mm256_cvtepi8_epi16(_mm_and_si128(qh, m8));
|
const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
|
||||||
v_gindex = _mm256_or_si256(_mm256_cvtepi8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
||||||
const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
|
const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
|
||||||
|
|
||||||
for (int i32 = 0; i32 < 4; ++i32) {
|
for (int i32 = 0; i32 < 4; ++i32) {
|
||||||
|
Loading…
Reference in New Issue
Block a user