From 2f8bd2b90133cf37ae752015e1bfd738cc6d0112 Mon Sep 17 00:00:00 2001 From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:57:41 +0530 Subject: [PATCH] llamafile : extend sgemm.cpp support for Q5_0 models (#10010) --- ggml/src/llamafile/sgemm.cpp | 57 ++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index 0193a463a..9eead3f61 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -942,6 +942,36 @@ class tinyBLAS_Q0_AVX { return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8)); } + inline __m256i load(const block_q5_0 *b) { + return _mm256_or_si256(denibble(b->qs), bittobyte(b->qh)); + } + + inline __m128i load0(const block_q5_0* b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + uint32_t x32; + memcpy(&x32, b->qh, sizeof(uint32_t)); + __m128i qxl = _mm_and_si128(_mm_set1_epi8(15), x); + __m128i bytesl = _mm_cmpeq_epi8(_mm_set1_epi64x(-1), + _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe), + _mm_shuffle_epi8(_mm_set1_epi32(x32), + _mm_set_epi64x(0x0101010101010101, 0x0000000000000000)))); + bytesl = _mm_andnot_si128(bytesl, _mm_set1_epi8((char)0xF0)); + return _mm_or_si128(qxl, bytesl); + } + + inline __m128i load1(const block_q5_0* b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + uint32_t x32; + memcpy(&x32, b->qh, sizeof(uint32_t)); + __m128i qxh = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)); + __m128i bytesh = _mm_cmpeq_epi8(_mm_set1_epi64x(-1), + _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe), + _mm_shuffle_epi8(_mm_set1_epi32(x32), + _mm_set_epi64x(0x0303030303030303, 0x0202020202020202)))); + bytesh = _mm_andnot_si128(bytesh, _mm_set1_epi8((char)0xF0)); + return _mm_or_si128(qxh, bytesh); + } + inline __m256i load(const block_iq4_nl *b) { return MM256_SET_M128I(load1(b), load0(b)); } @@ -973,6 +1003,17 @@ class tinyBLAS_Q0_AVX { _mm_srli_epi16(x, 4), 1)); } + static inline __m256i bittobyte(const uint8_t *p) { + uint32_t x32; + memcpy(&x32, p, sizeof(uint32_t)); + __m256i bytes = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1), + _mm256_or_si256(_mm256_set1_epi64x(0x7fbfdfeff7fbfdfe), + _mm256_shuffle_epi8(_mm256_set1_epi32(x32), + _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000)))); + return _mm256_andnot_si256(bytes, _mm256_set1_epi8((char)0xF0)); + } + const TA *const A; const TB *const B; TC *const C; @@ -1182,6 +1223,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda #endif } + case GGML_TYPE_Q5_0: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ + k, (const block_q5_0 *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n); + return true; +#else + return false; +#endif + } + case GGML_TYPE_IQ4_NL: { if (Btype != GGML_TYPE_Q8_0) return false;