diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 88044e622..1086cc9ed 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -26,6 +26,8 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, + { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", }, + { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", }, { "Q1_3", LLAMA_FTYPE_MOSTLY_Q1_3, " 1.63 bpw for BitNet b1.58", }, { "Q2_2", LLAMA_FTYPE_MOSTLY_Q2_2, " 2.00 bpw for BitNet b1.58", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 6bc4be298..03884cba4 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -390,8 +390,10 @@ extern "C" { GGML_TYPE_Q4_0_4_4 = 31, GGML_TYPE_Q4_0_4_8 = 32, GGML_TYPE_Q4_0_8_8 = 33, - GGML_TYPE_Q2_2 = 34, - GGML_TYPE_Q1_3 = 35, + GGML_TYPE_TQ1_0 = 34, + GGML_TYPE_TQ2_0 = 35, + GGML_TYPE_Q2_2 = 36, + GGML_TYPE_Q1_3 = 37, GGML_TYPE_COUNT, }; diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index fbfc24d6c..3be4dd4ca 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -241,6 +241,25 @@ typedef struct { } block_q8_0x8; static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); +// +// Ternary quantization +// + +// 1.6875 bpw +typedef struct { + uint8_t q[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256) + uint8_t qs[QK_K/64]; // 4 elements per byte + ggml_half d; +} block_tq1_0; +static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding"); + +// 2.0625 bpw +typedef struct { + uint8_t q[QK_K/4]; // 2 bits per element + ggml_half d; +} block_tq2_0; +static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding"); + // // Super-block quantization structures // diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 3e6231e7d..44506f731 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3366,7 +3366,190 @@ size_t quantize_q2_2(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -// ====================== 1.625 bpw (de)-quantization (BitNet b1.58) +// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) + +void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int64_t i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK_K; j++) { + const float v = x[j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + // 5 elements per byte, along 32 bytes + for (size_t j = 0; j < sizeof(y->q) - sizeof(y->q) % 32; j += 32) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 5; ++n) { + int xi = nearest_int(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2 + q *= 3; + q += xi; + } + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[i].q[j + m] = q; + } + x += 5*32; + } + // along 16 bytes + for (size_t j = sizeof(y->q) - sizeof(y->q) % 32; j < sizeof(y->q); j += 16) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 5; ++n) { + int xi = nearest_int(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2 + q *= 3; + q += xi; + } + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[i].q[j + m] = q; + } + x += 5*16; + } + // 4 elements per byte + for (size_t j = 0; j < sizeof(y->qs); ++j) { + uint8_t q = 0; + for (size_t m = 0; m < 4; ++m) { + // -1, 0, 1 -> 0, 1, 2 + int xi = nearest_int(x[j + m*sizeof(y->qs)] * id) + 1; + q *= 3; + q += xi; + } + // shift the first value to the most significant trit + q *= 3; + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[i].qs[j] = q; + } + x += 4*sizeof(y->qs); + } +} + +void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int64_t i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK_K; j++) { + const float v = x[j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + // TODO: should it be along 64 bytes instead for AVX512? + for (size_t j = 0; j < sizeof(y->q); j += 32) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 4; ++n) { + // -1, 0, 1 -> 1, 2, 3 + int xi = nearest_int(x[m + n*32] * id) + 2; + q += (xi & 3) << (2*n); + } + y[i].q[j + m] = q; + } + x += 4*32; + } + } +} + +void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) { + assert(k % QK_K == 0); + block_tq1_0 * restrict y = vy; + quantize_row_tq1_0_ref(x, y, k); +} + +void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) { + assert(k % QK_K == 0); + block_tq2_0 * restrict y = vy; + quantize_row_tq2_0_ref(x, y, k); +} + +size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row); + quantize_row_tq1_0(src, dst, (int64_t)nrow*n_per_row); + return nrow * row_size; +} + +size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row); + quantize_row_tq2_0(src, dst, (int64_t)nrow*n_per_row); + return nrow * row_size; +} + + +void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + for (int64_t i = 0; i < nb; ++i) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (size_t j = 0; j < sizeof(x->q) - sizeof(x->q) % 32; j += 32) { + for (size_t n = 0; n < 5; ++n) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].q[j + m] * pow3[n]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + *y++ = (float) (xi - 1) * d; + } + } + } + for (size_t j = sizeof(x->q) - sizeof(x->q) % 32; j < sizeof(x->q); j += 16) { + for (size_t n = 0; n < 5; ++n) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].q[j + m] * pow3[n]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + *y++ = (float) (xi - 1) * d; + } + } + } + + for (size_t n = 0; n < 4; ++n) { + for (size_t j = 0; j < sizeof(x->qs); ++j) { + uint8_t q = x[i].qs[j] * pow3[n]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + *y++ = (float) (xi - 1) * d; + } + } + } +} + +void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int64_t i = 0; i < nb; ++i) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (size_t j = 0; j < sizeof(x->q); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t m = 0; m < 32; ++m) { + *y++ = (float) (((x[i].q[j + m] >> (l*2)) & 3) - 2) * d; + } + } + } + } +} void quantize_row_q1_3_ref(const float * restrict x, block_q1_3 * restrict y, int64_t k) { assert(k % QK1_3 == 0); @@ -5730,6 +5913,276 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; } +void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq1_0 * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + __m256 sumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + // 16-bit sums + __m256i sumi0 = _mm256_setzero_si256(); + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + + // first 32 bytes of 5 elements + { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].q)); + // 8-bit multiplies with shifts, masks and adds + __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3 + __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9 + __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9 + __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9 + + // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits? + + // Cancel the +1 from avg so that it behaves like a halving add + qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1)); + qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1)); + qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1)); + qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1)); + qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1)); + // Multiply by 3 and get the top 2 bits + qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256())); + qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256())); + qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256())); + qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256())); + qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256())); + qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3)); + qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3)); + qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3)); + qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3)); + qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3)); + + // 0, 1, 2 => -1, 0, 1 + qx0 = _mm256_sub_epi8(qx0, _mm256_set1_epi8(1)); + qx1 = _mm256_sub_epi8(qx1, _mm256_set1_epi8(1)); + qx2 = _mm256_sub_epi8(qx2, _mm256_set1_epi8(1)); + qx3 = _mm256_sub_epi8(qx3, _mm256_set1_epi8(1)); + qx4 = _mm256_sub_epi8(qx4, _mm256_set1_epi8(1)); + + const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0)); + const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32)); + const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64)); + const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96)); + const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128)); + + // dot + qx0 = _mm256_sign_epi8(qy0, qx0); + qx1 = _mm256_sign_epi8(qy1, qx1); + qx2 = _mm256_sign_epi8(qy2, qx2); + qx3 = _mm256_sign_epi8(qy3, qx3); + qx4 = _mm256_sign_epi8(qy4, qx4); + + // widening addition + qx0 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx0); + qx1 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx1); + qx2 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx2); + qx3 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx3); + qx4 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx4); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); + sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); + sumi2 = _mm256_add_epi16(sumi2, qx4); + } + + // last 16 bytes of 5-element, along with the 4 bytes of 4 elements + { + __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].q + 32)); + __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_broadcastd_epi32(_mm_loadu_si32((const void *) x[i].qs))); + __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 + __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 + __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 + __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9 + __m256i qx01 = MM256_SET_M128I(qx1, qx0); + __m256i qx23 = MM256_SET_M128I(qx3, qx2); + + // avx2 does not have 8-bit multiplies, so 16-bit it is. + qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1)); + qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF)); + __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1)); + + __m256i qx45 = MM256_SET_M128I(qx5, qx4); + + // Cancel the +1 from avg so that it behaves like a halving add + qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1)); + qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1)); + qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1)); + // Multiply by 3 and get the top 2 bits + qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256())); + qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256())); + qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256())); + qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3)); + qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3)); + qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3)); + + // 0, 1, 2 => -1, 0, 1 + qx01 = _mm256_sub_epi8(qx01, _mm256_set1_epi8(1)); + qx23 = _mm256_sub_epi8(qx23, _mm256_set1_epi8(1)); + qx45 = _mm256_sub_epi8(qx45, _mm256_set1_epi8(1)); + + const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160)); + const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192)); + const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224)); + + // dot + qx01 = _mm256_sign_epi8(qy01, qx01); + qx23 = _mm256_sign_epi8(qy23, qx23); + qx45 = _mm256_sign_epi8(qy45, qx45); + + // widening addition + qx01 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx01); + qx23 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx23); + qx45 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx45); + + sumi0 = _mm256_add_epi16(sumi0, qx01); + sumi1 = _mm256_add_epi16(sumi1, qx23); + sumi2 = _mm256_add_epi16(sumi2, qx45); + } + + const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); + sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); + + sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); + } + + *s = hsum_float_8(sumf); +// #elif defined __ARM_NEON +#else + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + + for (size_t j = 0; j < sizeof(x->q) - sizeof(x->q) % 32; j += 32) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].q[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; + } + } + } + for (size_t j = sizeof(x->q) - sizeof(x->q) % 32; j < sizeof(x->q); j += 16) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].q[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; + } + } + } + + for (size_t l = 0; l < 4; ++l) { + for (size_t j = 0; j < sizeof(x->qs); ++j) { + uint8_t q = x[i].qs[j] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[sizeof(x->q)*5 + l*sizeof(x->qs) + j]; + } + } + + sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq2_0 * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + __m256 sumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + // 16-bit sums, because 256*127 still fits + __m256i sumi0 = _mm256_setzero_si256(); + __m256i sumi1 = _mm256_setzero_si256(); + + for (size_t j = 0; j < sizeof(x->q); j += 32) { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].q + j)); + __m256i qx1 = _mm256_srli_epi16(qx0, 2); + __m256i qx2 = _mm256_srli_epi16(qx0, 4); + __m256i qx3 = _mm256_srli_epi16(qx0, 6); + + // 1, 2, 3 => -1, 0, 1 + qx0 = _mm256_sub_epi8(_mm256_and_si256(qx0, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + qx1 = _mm256_sub_epi8(_mm256_and_si256(qx1, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + qx2 = _mm256_sub_epi8(_mm256_and_si256(qx2, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + qx3 = _mm256_sub_epi8(_mm256_and_si256(qx3, _mm256_set1_epi8(3)), _mm256_set1_epi8(2)); + + const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0)); + const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32)); + const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64)); + const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96)); + + qx0 = _mm256_sign_epi8(qy0, qx0); + qx1 = _mm256_sign_epi8(qy1, qx1); + qx2 = _mm256_sign_epi8(qy2, qx2); + qx3 = _mm256_sign_epi8(qy3, qx3); + + qx0 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx0); + qx1 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx1); + qx2 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx2); + qx3 = _mm256_maddubs_epi16(_mm256_set1_epi8(1), qx3); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); + sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); + } + const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + + sumi0 = _mm256_add_epi16(sumi0, sumi1); + sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); + + sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); + } + + *s = hsum_float_8(sumf); +#else + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int32_t sumi = 0; + + for (size_t j = 0; j < sizeof(x->q); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t k = 0; k < 32; ++k) { + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].q[j + k] >> (l*2)) & 3) - 2); + } + } + } + + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + + sumf += (float) sumi * d; + } + + *s = sumf; +#endif +} + void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -15279,6 +15732,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte } } } break; + case GGML_TYPE_TQ1_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_tq1_0, data, nb); + } break; + case GGML_TYPE_TQ2_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb); + } break; case GGML_TYPE_IQ1_S: { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 0e30b306c..238cfd3fb 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -28,6 +28,9 @@ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_REST void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k); +void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k); +void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k); + void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); @@ -50,6 +53,9 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -73,6 +79,9 @@ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRI void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -98,6 +107,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -119,6 +131,9 @@ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT ds size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b652598f1..5bbe0e4a8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -863,7 +863,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q2_2, .from_float = quantize_row_q2_2, - .from_float_reference = (ggml_from_float_t) quantize_row_q2_2_reference, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_2_ref, .vec_dot = ggml_vec_dot_q2_2_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, @@ -875,7 +875,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q1_3, .from_float = quantize_row_q1_3, - .from_float_reference = (ggml_from_float_t) quantize_row_q1_3_reference, + .from_float_ref = (ggml_from_float_t) quantize_row_q1_3_ref, .vec_dot = ggml_vec_dot_q1_3_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, @@ -994,7 +994,31 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .ncols = 8, .gemv = ggml_gemv_q4_0_8x8_q8_0, .gemm = ggml_gemm_q4_0_8x8_q8_0, - } + }, + [GGML_TYPE_TQ1_0] = { + .type_name = "tq1_0", + .blck_size = QK_K, + .type_size = sizeof(block_tq1_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_tq1_0, + .from_float = quantize_row_tq1_0, + .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref, + .vec_dot = ggml_vec_dot_tq1_0_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_TQ2_0] = { + .type_name = "tq2_0", + .blck_size = QK_K, + .type_size = sizeof(block_tq2_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_tq2_0, + .from_float = quantize_row_tq2_0, + .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, + .vec_dot = ggml_vec_dot_tq2_0_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, }; // For internal test use @@ -13332,6 +13356,8 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -13923,6 +13949,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -20622,6 +20650,8 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f87afd77d..32e73e56a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1145,8 +1145,13 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 - Q2_2 = 31 - Q1_3 = 32 + Q4_0_4_4 = 31 + Q4_0_4_8 = 32 + Q4_0_8_8 = 33 + TQ1_0 = 34 + TQ2_0 = 35 + Q1_3 = 36 + Q2_2 = 37 # TODO: add GGMLFileType from ggml_ftype in ggml.h diff --git a/include/llama.h b/include/llama.h index 2a3f2c913..7dcc260e8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -166,8 +166,10 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_2 = 36, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q1_3 = 37, // except 1d tensors + LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors + LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q1_3 = 38, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_2 = 39, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 7a33ac852..7ab2b47cd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3771,6 +3771,8 @@ struct llama_model_loader { case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; + case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; + case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; @@ -4466,6 +4468,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; + case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary"; + case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; @@ -15344,6 +15348,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { + new_type = GGML_TYPE_Q4_K; + } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { @@ -15647,6 +15654,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_S: case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break; + case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break; + case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index b3d07d6d1..a6e508d01 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -15,13 +15,13 @@ constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; -constexpr float MAX_QUANTIZATION_TOTAL_ERROR_BITNET = 0.015625f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.015625f; // TODO: change to 0.01f constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f; constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f; -constexpr float MAX_DOT_PRODUCT_ERROR_BITNET = 0.5f; +constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.5f; // TODO: change to 0.15f static const char* RESULT_STR[] = {"ok", "FAILED"}; @@ -146,8 +146,10 @@ int main(int argc, char * argv[]) { if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = - type == GGML_TYPE_Q1_3 ? MAX_QUANTIZATION_TOTAL_ERROR_BITNET : - type == GGML_TYPE_Q2_2 ? MAX_QUANTIZATION_TOTAL_ERROR_BITNET : + type == GGML_TYPE_Q1_3 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_Q2_2 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : @@ -170,8 +172,8 @@ int main(int argc, char * argv[]) { const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT - : type == GGML_TYPE_Q2_2 || type == GGML_TYPE_Q1_3 - ? MAX_DOT_PRODUCT_ERROR_BITNET + : type == GGML_TYPE_Q2_2 || type == GGML_TYPE_Q1_3 || type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 + ? MAX_DOT_PRODUCT_ERROR_TERNARY : MAX_DOT_PRODUCT_ERROR; failed = !(vec_dot_error < max_allowed_error); num_failed += failed;