From cb5fad4c6c2cbef92e9b8b63449e1cb7664e4846 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 1 Jul 2024 20:39:06 +0200 Subject: [PATCH] CUDA: refactor and optimize IQ MMVQ (#8215) * CUDA: refactor and optimize IQ MMVQ * uint -> uint32_t * __dp4a -> ggml_cuda_dp4a * remove MIN_CC_DP4A checks * change default * try CI fix --- ggml/src/ggml-common.h | 14 +- ggml/src/ggml-cuda.cu | 12 +- ggml/src/ggml-cuda/common.cuh | 76 +++- ggml/src/ggml-cuda/fattn-common.cuh | 50 +- ggml/src/ggml-cuda/mmvq.cu | 26 +- ggml/src/ggml-cuda/vecdotq.cuh | 682 +++++++++++++--------------- ggml/src/ggml-sycl/mmvq.cpp | 12 +- ggml/src/ggml-sycl/vecdotq.hpp | 21 - 8 files changed, 406 insertions(+), 487 deletions(-) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index e8efceb76..c74060cc4 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2; #define QR6_K 2 #define QI2_XXS (QK_K / (4*QR2_XXS)) -#define QR2_XXS 8 +#define QR2_XXS 4 #define QI2_XS (QK_K / (4*QR2_XS)) -#define QR2_XS 8 +#define QR2_XS 4 #define QI2_S (QK_K / (4*QR2_S)) -#define QR2_S 8 +#define QR2_S 4 #define QI3_XXS (QK_K / (4*QR3_XXS)) -#define QR3_XXS 8 +#define QR3_XXS 4 #define QI3_XS (QK_K / (4*QR3_XS)) -#define QR3_XS 8 +#define QR3_XS 4 #define QI1_S (QK_K / (4*QR1_S)) #define QR1_S 8 @@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2; #define QR4_NL 2 #define QI4_XS (QK_K / (4*QR4_XS)) -#define QR4_XS 8 +#define QR4_XS 2 #define QI3_S (QK_K / (4*QR3_S)) -#define QR3_S 8 +#define QR3_S 4 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 0acfda91d..649ef5a08 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor bool use_mul_mat_q = ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; + // if mmvq is available it's a better choice than dmmv: +#ifndef GGML_CUDA_FORCE_DMMV + use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; +#endif // GGML_CUDA_FORCE_DMMV + bool any_gpus_with_slow_fp16 = false; if (split) { @@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } const int cc = ggml_cuda_info().devices[id].cc; - use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A; use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); } } else { const int cc = ggml_cuda_info().devices[ctx.device].cc; - use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A; use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); } - // if mmvq is available it's a better choice than dmmv: -#ifndef GGML_CUDA_FORCE_DMMV - use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; -#endif // GGML_CUDA_FORCE_DMMV - // debug helpers //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 8d00db6c1..472f4ace1 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -3,6 +3,7 @@ #include "ggml.h" #include 
"ggml-cuda.h" +#include #include #if defined(GGML_USE_HIPBLAS) @@ -268,30 +269,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigne return c; } -static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { -#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) - c = __builtin_amdgcn_sdot4(a, b, c, false); -#elif defined(RDNA3) - c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); -#elif defined(__gfx1010__) || defined(__gfx900__) - int tmp1; - int tmp2; - asm("\n \ - v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ - v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ - v_add3_u32 %0, %1, %2, %0 \n \ - v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ - v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ - v_add3_u32 %0, %1, %2, %0 \n \ - " - : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) - : "v"(a), "v"(b) - ); -#else - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); - c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; -#endif +static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0x00 : 0xff; + } return c; } @@ -467,8 +453,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half } #endif // CUDART_VERSION < 12000 +static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(RDNA3) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; + asm("\n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + " + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) + : "v"(a), "v"(b) + ); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; + +#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + +#if __CUDA_ARCH__ >= MIN_CC_DP4A + return __dp4a(a, b, c); +#else // __CUDA_ARCH__ >= MIN_CC_DP4A + const int8_t * a8 = (const int8_t *) &a; + const int8_t * b8 = (const int8_t *) &b; + return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + // TODO: move to ggml-common.h -static const __device__ int8_t kvalues_iq4nl[16] = 
{-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; +static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v); diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index bd7993595..650780fd2 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)( template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c; GGML_UNUSED(Q_v); - half sum = 0.0f; + T sum = 0.0f; #pragma unroll for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) { @@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F; const int u = Q_q8[k_KQ_0/WARP_SIZE]; - const int sumi = __dp4a(v, u, 0); + const int sumi = ggml_cuda_dp4a(v, u, 0); #ifdef FP16_AVAILABLE if (std::is_same::value) { @@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( } return sum; -#else - GGML_UNUSED(K_c); - GGML_UNUSED(Q_v); - GGML_UNUSED(Q_q8); - GGML_UNUSED(Q_ds_v); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c; GGML_UNUSED(Q_v); @@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F; const int u = Q_q8[k_KQ_0/WARP_SIZE]; - const int sumi = __dp4a(v, u, 0); + const int sumi = ggml_cuda_dp4a(v, u, 0); #ifdef FP16_AVAILABLE if (std::is_same::value) { @@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( } return sum; -#else - GGML_UNUSED(K_c); - GGML_UNUSED(Q_v); - GGML_UNUSED(Q_q8); - GGML_UNUSED(Q_ds_v); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c; GGML_UNUSED(Q_v); @@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0( const int u = Q_q8[k_KQ_0/WARP_SIZE]; - const int sumi = __dp4a(v, u, 0); + const int sumi = ggml_cuda_dp4a(v, u, 0); #ifdef FP16_AVAILABLE if (std::is_same::value) { @@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0( } return sum; -#else - GGML_UNUSED(K_c); - GGML_UNUSED(Q_v); - GGML_UNUSED(Q_q8); - GGML_UNUSED(Q_ds_v); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q5_1 * K_q5_1 = (const block_q5_1 *) 
K_c; GGML_UNUSED(Q_v); @@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( const int u = Q_q8[k_KQ_0/WARP_SIZE]; - const int sumi = __dp4a(v, u, 0); + const int sumi = ggml_cuda_dp4a(v, u, 0); #ifdef FP16_AVAILABLE if (std::is_same::value) { @@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( } return sum; -#else - GGML_UNUSED(K_c); - GGML_UNUSED(Q_v); - GGML_UNUSED(Q_q8); - GGML_UNUSED(Q_ds_v); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c; GGML_UNUSED(Q_v); @@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0( } return sum; -#else - GGML_UNUSED(K_c); - GGML_UNUSED(Q_v); - GGML_UNUSED(Q_q8); - GGML_UNUSED(Q_ds_v); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index e8d157169..e22faf69b 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -28,16 +28,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) static constexpr __device__ int get_vdr_mmvq(ggml_type type) { return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ : - type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ : - type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ : - type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ : - type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ : - type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ : - type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ : - type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ : - type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ : - type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ : + type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ : + type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ : + type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ : + type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ : + type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ : + type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ : + type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ : + type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ : + type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ : + type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ : + type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ : + type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ : + type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ : + type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ : + type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ : + type == GGML_TYPE_IQ4_XS ? 
VDR_IQ4_XS_Q8_1_MMVQ : 1; } diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh index 3b12d6566..3f3a87c75 100644 --- a/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ggml/src/ggml-cuda/vecdotq.cuh @@ -1,4 +1,5 @@ #include "common.cuh" +#include static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment @@ -28,6 +29,18 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment } +static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) { + const uint16_t * x16 = (const uint16_t *) x; + + int x32 = x16[2*i32 + 0] << 0; + x32 |= x16[2*i32 + 1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) { + return ((const int *) x)[i32]; // assume at least 4 byte alignment +} // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q @@ -38,7 +51,6 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( const int * v, const int * u, const float & d4, const half2 & ds8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -47,17 +59,14 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, u[2*i+1], sumi); + sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); + sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); } const float2 ds8f = __half22float2(ds8); // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q4_1_Q8_1_MMVQ 2 @@ -66,7 +75,6 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( const int * v, const int * u, const half2 & dm4, const half2 & ds8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -75,8 +83,8 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, u[2*i+1], sumi); + sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); + sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); } #ifdef GGML_CUDA_F16 @@ -92,9 +100,6 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q5_0_Q8_1_MMVQ 2 @@ -103,7 +108,6 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -113,23 +117,20 @@ 
template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values } const float2 ds8f = __half22float2(ds8); // second part effectively subtracts 16 from each quant value return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q5_1_Q8_1_MMVQ 2 @@ -138,7 +139,6 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -148,14 +148,14 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values } #ifdef GGML_CUDA_F16 @@ -171,10 +171,6 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it return sumi*d5d8 + m5s8 / (QI5_1 / vdr); - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q8_0_Q8_1_MMVQ 2 @@ -183,31 +179,26 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp template static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl( const int * v, const int * u, const T & d8_0, const T & d8_1) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll for (int i = 0; i < vdr; ++i) { // SIMD dot product of quantized values - sumi = __dp4a(v[i], u[i], sumi); + sumi = ggml_cuda_dp4a(v[i], u[i], sumi); } return d8_0*d8_1 * ((T) sumi); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( const int * v, const int * u, const half2 & dm8, const half2 & ds8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll for (int i = 0; i < vdr; ++i) { // SIMD dot product of quantized values - sumi = __dp4a(v[i], u[i], sumi); + 
sumi = ggml_cuda_dp4a(v[i], u[i], sumi); } #ifdef GGML_CUDA_F16 @@ -223,9 +214,6 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q2_K_Q8_1_MMVQ 1 @@ -236,7 +224,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, const half2 & dm2, const float * __restrict__ d8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -246,28 +233,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( const int vi = (v >> (2*i)) & 0x03030303; - sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + sumf_d += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product // fill int with 4x m int m = sc >> 4; m |= m << 8; m |= m << 16; - sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + sumf_m += d8[i] * ggml_cuda_dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values } const float2 dm2f = __half22float2(dm2); return dm2f.x*sumf_d - dm2f.y*sumf_m; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -281,8 +264,8 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( #pragma unroll for (int i = i0; i < i0 + QI8_1/2; ++i) { const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303; - sumi_d = __dp4a(vi, u[i], sumi_d); // SIMD dot product - sumi_m = __dp4a(0x01010101, u[i], sumi_m); + sumi_d = ggml_cuda_dp4a(vi, u[i], sumi_d); // SIMD dot product + sumi_m = ggml_cuda_dp4a(0x01010101, u[i], sumi_m); } sumf_d += dm2f.x * sumi_d; @@ -290,9 +273,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( } return d8*(sumf_d - sumf_m); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q3_K_Q8_1_MMVQ 1 @@ -303,7 +283,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, const int & scale_offset, const float & d3, const float * __restrict__ d8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf = 0.0f; #pragma unroll @@ -326,13 +305,10 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( const int vi = __vsubss4(vil, vih); - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product } return d3 * sumf; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -340,7 +316,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, const float & d3, const float & d8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -350,16 +325,13 @@ static __device__ 
__forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( #pragma unroll for (int i = i0; i < i0 + QI8_1/2; ++i) { const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404); - sumi_sc = __dp4a(vi, u[i], sumi_sc); // SIMD dot product + sumi_sc = ggml_cuda_dp4a(vi, u[i], sumi_sc); // SIMD dot product } sumi += sumi_sc * scales[i0 / (QI8_1/2)]; } return d3*d8 * sumi; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q4_K_Q8_1_MMVQ 2 @@ -370,7 +342,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -379,8 +350,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; - const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u sumf_d += d8[i] * (dot1 * sc[i]); sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values @@ -389,10 +360,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( const float2 dm4f = __half22float2(dm4); return dm4f.x*sumf_d - dm4f.y*sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -400,7 +367,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -410,7 +376,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( #pragma unroll for (int j = 0; j < QI8_1; ++j) { - sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product + sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product } const float2 ds8f = __half22float2(ds8[i]); @@ -422,10 +388,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( const float2 dm4f = __half22float2(dm4); return dm4f.x*sumf_d - dm4f.y*sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q5_K_Q8_1_MMVQ 2 @@ -436,7 +398,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -451,8 +412,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int v0i = vl0i | vh0i; const int v1i = vl1i | vh1i; - const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, 
u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + const int dot1 = ggml_cuda_dp4a(v0i, u[2*i+0], ggml_cuda_dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+0], ggml_cuda_dp4a(0x01010101, u[2*i+1], 0)); // sum of u sumf_d += d8[i] * (dot1 * sc[i]); sumf_m += d8[i] * (dot2 * m[i]); @@ -462,10 +423,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( const float2 dm5f = __half22float2(dm5); return dm5f.x*sumf_d - dm5f.y*sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -473,7 +430,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -483,7 +439,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( #pragma unroll for (int j = 0; j < QI8_1; ++j) { - sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product } const float2 ds8f = __half22float2(ds8[i]); @@ -495,10 +451,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( const float2 dm4f = __half22float2(dm4); return dm4f.x*sumf_d - dm4f.y*sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q6_K_Q8_1_MMVQ 1 @@ -509,7 +461,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, const float & d, const float * __restrict__ d8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf = 0.0f; #pragma unroll @@ -522,13 +473,10 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product } return d*sumf; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -536,7 +484,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, const float & d6, const float * __restrict__ d8) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; #pragma unroll @@ -545,21 +492,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( #pragma unroll for (int i = i0; i < i0 + 2; ++i) { - sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product - sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product + sumi_d.x = ggml_cuda_dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product + sumi_d.x = ggml_cuda_dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product - sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product - sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + sumi_d.y = ggml_cuda_dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product + sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product } sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); } return d6 * 
sumf_d; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } static __device__ __forceinline__ float vec_dot_q4_0_q8_1( @@ -572,9 +515,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( #pragma unroll for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + v[i] = get_int_b2(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_0); } return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); @@ -591,9 +534,9 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( #pragma unroll for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + v[i] = get_int_b4(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_1); } return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); @@ -610,10 +553,10 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( #pragma unroll for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { - vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); - vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + vl[i] = get_int_b2(bq5_0->qs, iqs + i); + vh[i] = get_int_b2(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_0); } return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); @@ -630,10 +573,10 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( #pragma unroll for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { - vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); - vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + vl[i] = get_int_b4(bq5_1->qs, iqs + i); + vh[i] = get_int_b4(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_1); } return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); @@ -649,8 +592,8 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( #pragma unroll for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_int8(bq8_0->qs, iqs + i); - u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + v[i] = get_int_b2(bq8_0->qs, iqs + i); + u[i] = get_int_b4(bq8_1->qs, iqs + i); } return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); @@ -666,13 +609,13 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( const uint8_t * scales = bq2_K->scales + scale_offset; - const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + const int v = get_int_b4(bq2_K->qs, iqs); int u[QR2_K]; float d8[QR2_K]; #pragma unroll for (int i = 0; i < QR2_K; ++ i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1); d8[i] = __low2float(bq8_1[bq8_offset + i].ds); } @@ -689,17 +632,17 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( const float d = bq3_K->d; - const int vl = get_int_from_uint8(bq3_K->qs, iqs); + 
const int vl = get_int_b2(bq3_K->qs, iqs); // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + const int vh = ~get_int_b2(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; int u[QR3_K]; float d8[QR3_K]; #pragma unroll for (int i = 0; i < QR3_K; ++i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1); d8[i] = __low2float(bq8_1[bq8_offset + i].ds); } @@ -807,8 +750,8 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); - const int vl = get_int_from_uint8(bq6_K->ql, iqs); - const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + const int vl = get_int_b2(bq6_K->ql, iqs); + const int vh = get_int_b2(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; const int8_t * scales = bq6_K->scales + scale_offset; @@ -817,335 +760,342 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( #pragma unroll for (int i = 0; i < QR6_K; ++i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + u[i] = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds); } return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); } +#define VDR_IQ2_XXS_Q8_1_MMVQ 2 + static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx; -#if QR2_XXS == 8 - const int ib32 = iqs; - const uint16_t * q2 = bq2->qs + 4*ib32; - const uint8_t * aux8 = (const uint8_t *)q2; - const int8_t * q8 = bq8_1[ib32].qs; - uint32_t aux32 = q2[2] | (q2[3] << 16); + const int q2 = get_int_b2(bq2->qs, iqs); + const uint8_t * aux8 = (const uint8_t *) &q2; + const uint32_t aux32 = get_int_b2(bq2->qs, iqs + 1); + int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); - const uint8_t signs = ksigns_iq2xs[aux32 & 127]; - for (int j = 0; j < 8; ++j) { - sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); - } - q8 += 8; - aux32 >>= 7; +#pragma unroll + for (int k0 = 0; k0 < 8; k0 += 2) { + const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]); + const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F]; + + const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000); + const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0); + const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0); + sumi = ggml_cuda_dp4a(grid0, u0, sumi); + + const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000); + const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1); + const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1); + sumi = ggml_cuda_dp4a(grid1, u1, sumi); } - const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f; + + const int ls = aux32 >> 28; + sumi = (ls*sumi + sumi/2)/4; + const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds); return d * sumi; -#else - // iqs is 0...15 - const int ib32 = iqs/2; - const int il = iqs%2; - const uint16_t * q2 = bq2->qs + 4*ib32; - const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f; - const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; - const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; - const int8_t * q8 = bq8_1[ib32].qs + 16*il; - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1); - sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? 
-1 : 1); - } - return d * (sumi1 + sumi2); -#endif } +#define VDR_IQ2_XS_Q8_1_MMVQ 2 + static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx; - const int ib32 = iqs; - const uint16_t * q2 = bq2->qs + 4*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - const uint8_t ls1 = bq2->scales[ib32] & 0xf; - const uint8_t ls2 = bq2->scales[ib32] >> 4; + const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1)); + const uint16_t * q2 = (const uint16_t *) &q2_packed; + const int ls0 = bq2->scales[iqs/2] & 0x0F; + const int ls1 = bq2->scales[iqs/2] >> 4; + + int sumi0 = 0; int sumi1 = 0; - for (int l = 0; l < 2; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); - const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); - const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]); - const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]); - sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1); - sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1); - q8 += 8; +#pragma unroll + for (int l0 = 0; l0 < 8; l0 += 2) { + const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9)); + + const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]); + const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]); + + const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0); + const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1); + + if (l0 < 4) { + sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0); + sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0); + } else { + sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1); + sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1); + } } - int sumi2 = 0; - for (int l = 2; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); - const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); - const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]); - const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]); - sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2); - sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2); - q8 += 8; - } - const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; - return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -#else - GGML_UNUSED(ksigns64); - NO_DEVICE_CODE; -#endif + const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4; + const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds); + return d * sumi; } -// TODO +#define VDR_IQ2_S_Q8_1_MMVQ 2 + static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx; - const int ib32 = iqs; - const int8_t * q8 = bq8_1[ib32].qs; - const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; - const uint8_t ls1 = bq2->scales[ib32] & 0xf; - const uint8_t ls2 = bq2->scales[ib32] >> 4; + const int qs_packed = get_int_b2(bq2->qs, iqs/2); + const uint8_t * qs = (const uint8_t *) &qs_packed; + + const int qh = bq2->qh[iqs/2]; + + const int signs_packed_32 = get_int_b2(bq2->qs, QK_K/32 + iqs/2); + const uint8_t * 
signs_packed_8 = (const uint8_t *) &signs_packed_32; + + const int ls0 = bq2->scales[iqs/2] & 0x0F; + const int ls1 = bq2->scales[iqs/2] >> 4; + + int sumi0 = 0; int sumi1 = 0; - for (int l = 0; l < 2; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); - const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid[1] ^ signs1, signs1); - sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1); - sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1); - q8 += 8; +#pragma unroll + for (int l0 = 0; l0 < 8; l0 += 2) { + const int * grid_pos = (const int *)(iq2s_grid + (qs[l0/2] | ((qh << (8-l0)) & 0x300))); + + const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000); + const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000); + + const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0); + const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1); + + const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0); + const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1); + + if (l0 < 4) { + sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0); + sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0); + } else { + sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1); + sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1); + } } - int sumi2 = 0; - for (int l = 2; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); - const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid[1] ^ signs1, signs1); - sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2); - sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2); - q8 += 8; - } - const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; - return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -#else - GGML_UNUSED(ksigns64); - NO_DEVICE_CODE; -#endif + const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4; + + const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds); + return d * sumi; } +#define VDR_IQ3_XXS_Q8_1_MMVQ 2 + static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq + kbx; - const int ib32 = iqs; - const uint8_t * q3 = bq2->qs + 8*ib32; - const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - uint32_t aux32 = gas[0] | (gas[1] << 16); + const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx; + + const int2 q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1)); + const uint8_t * q3 = (const uint8_t *) &q3_packed; + const uint32_t aux32 = get_int_b2(bq3->qs, QK_K/16 + iqs/2); + int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0]; - const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1]; - const uint32_t * 
signs = (const uint32_t *)(ksigns64 + (aux32 & 127)); - const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]); - const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]); - sumi = __dp4a(grid_l, *((int *)q8+0), sumi); - sumi = __dp4a(grid_h, *((int *)q8+1), sumi); - q8 += 8; - aux32 >>= 7; +#pragma unroll + for (int l0 = 0; l0 < 8; l0 += 2) { + const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]); + + const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F)); + + const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]); + const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]); + + const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0); + const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1); + + sumi = ggml_cuda_dp4a(grid_l, u0, sumi); + sumi = ggml_cuda_dp4a(grid_h, u1, sumi); } - const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f; + + const int ls = aux32 >> 28; + sumi = (ls*sumi + sumi/2)/2; + const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds); return d * sumi; -#else - NO_DEVICE_CODE; -#endif } +#define VDR_IQ3_S_Q8_1_MMVQ 2 + // TODO: don't use lookup table for signs static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq3_s * bq2 = (const block_iq3_s *) vbq + kbx; - const int ib32 = iqs; - const uint8_t * qs = bq2->qs + 8*ib32; - const int8_t * q8 = bq8_1[ib32].qs; + const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx; + + const int2 qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1)); + const uint8_t * qs = (const uint8_t *) &qs_packed; + + const int qh = bq3->qh[iqs/2]; + + const int signs_packed_32 = get_int_b2(bq3->signs, iqs/2); + const uint8_t * signs_packed_8 = (const uint8_t *) &signs_packed_32; + int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); - const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); - uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid2[0] ^ signs1, signs1); - sumi = __dp4a(grid_l, *((int *)q8+0), sumi); - sumi = __dp4a(grid_h, *((int *)q8+1), sumi); - q8 += 8; +#pragma unroll + for (int l0 = 0; l0 < 8; l0 += 2) { + const int2 grid_pos = make_int2( + iq3s_grid[qs[l0 + 0] | ((qh << (8 - l0)) & 0x100)], + iq3s_grid[qs[l0 + 1] | ((qh << (7 - l0)) & 0x100)]); + + const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000); + const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000); + + const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0); + const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1); + + const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0); + const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1); + + sumi = ggml_cuda_dp4a(grid_l, u0, sumi); + sumi = ggml_cuda_dp4a(grid_h, u1, sumi); } - const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds); + + sumi *= 1 + 
2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F); + + const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds); return d * sumi; -#else - NO_DEVICE_CODE; -#endif } static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx; - const int ib32 = iqs; + const int qs_packed = get_int_b2(bq1->qs, iqs); + const uint8_t * qs = (const uint8_t *) &qs_packed; + + const int qh = bq1->qh[iqs]; + int sumi = 0; -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const int * q8 = (const int *)bq8_1[ib32].qs; - for (int l = 0; l < 4; ++l) { - const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); - int grid0 = grid[0] & 0x0f0f0f0f; - int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; - sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi)); +#pragma unroll + for (int l0 = 0; l0 < 8; l0 += 2) { + const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)]; + + const int grid0 = (grid >> 0) & 0x0F0F0F0F; + const int grid1 = (grid >> 4) & 0x0F0F0F0F; + + const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0); + const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1); + + sumi = ggml_cuda_dp4a(grid0, u0, sumi); + sumi = ggml_cuda_dp4a(grid1, u1, sumi); } -#else - const int8_t * q8 = bq8_1[ib32].qs; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); - for (int j = 0; j < 4; ++j) { - sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4); - } - q8 += 8; - } -#endif - const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA; - const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1); - const float d = d1q * __low2float (bq8_1[ib32].ds); - const float m = d1q * __high2float(bq8_1[ib32].ds); - return d * sumi + m * delta; + + const float d1q = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1); + const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000); + const float2 ds = __half22float2(bq8_1[iqs].ds); + return d1q * (ds.x*sumi + ds.y*delta); } static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx; - const int ib32 = iqs; - int sumi[2] = {0, 0}; - float sumf[2] = {0.f, 0.f}; -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const int * q8 = (const int *)bq8_1[ib32].qs; - for (int l = 0; l < 4; ++l) { - const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8))); - int grid0 = grid[0] & 0x0f0f0f0f; - int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; - sumi[l/2] = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi[l/2])); - const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? 
-1-IQ1M_DELTA : -1+IQ1M_DELTA; - const int sumy = __dp4a(q8[2*l+1], 0x01010101, __dp4a(q8[2*l+0], 0x01010101, 0)); - sumf[l/2] += delta*sumy; - } -#else - const int8_t * q8 = bq8_1[ib32].qs; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + const int qs_packed = get_int_b4(bq1->qs, iqs); + const uint8_t * qs = (const uint8_t *) &qs_packed; + + int sumi[2] = {0}; + float sumf[2] = {0.0f}; +#pragma unroll + for (int l0 = 0; l0 < 8; l0 += 2) { + const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2)); + + const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)]; + + const int grid0 = (grid >> 0) & 0x0F0F0F0F; + const int grid1 = (grid >> 4) & 0x0F0F0F0F; + + const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0); + const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1); + + sumi[l0/4] = ggml_cuda_dp4a(grid0, u0, sumi[l0/4]); + sumi[l0/4] = ggml_cuda_dp4a(grid1, u1, sumi[l0/4]); + + const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08); int sumy = 0; - for (int j = 0; j < 4; ++j) { - sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4); - sumy += q8[j] + q8[j+4]; - } - const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; - sumf[l/2] += delta*sumy; - q8 += 8; + sumy = ggml_cuda_dp4a(u0, 0x01010101, sumy); + sumy = ggml_cuda_dp4a(u1, 0x01010101, sumy); + sumf[l0/4] += delta*sumy; } -#endif + + const uint16_t * sc = (const uint16_t *) bq1->scales; + iq1m_scale_t scale; - const uint16_t * sc = (const uint16_t *)bq1->scales; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds); - return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000); + const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds); + + const int tmp = sc[iqs/2] >> (6*(iqs%2)); + const int sc0 = 2*((tmp >> 0) & 0x07) + 1; + const int sc1 = 2*((tmp >> 3) & 0x07) + 1; + return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1); } -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values, - int & val1, int & val2) { +static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4) { + const int q0_32 = (q4 >> 0) & 0x0F0F0F0F; + const int8_t * q0_8 = (const int8_t *) &q0_32; + const char4 val0_8 = make_char4( + kvalues_iq4nl[q0_8[0]], kvalues_iq4nl[q0_8[1]], kvalues_iq4nl[q0_8[2]], kvalues_iq4nl[q0_8[3]]); - uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; - aux32 = q4 & 0x0f0f0f0f; - uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); - uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); - val1 = v1 | (v2 << 16); - aux32 = (q4 >> 4) & 0x0f0f0f0f; - v1 = values[q8[0]] | (values[q8[1]] << 8); - v2 = values[q8[2]] | (values[q8[3]] << 8); - val2 = v1 | (v2 << 16); + const int q1_32 = (q4 >> 4) & 0x0F0F0F0F; + const int8_t * q1_8 = (const int8_t *) &q1_32; + const char4 val1_8 = make_char4( + kvalues_iq4nl[q1_8[0]], kvalues_iq4nl[q1_8[1]], kvalues_iq4nl[q1_8[2]], kvalues_iq4nl[q1_8[3]]); + + return make_int2(*((const int *) &val0_8), *((const int *) &val1_8)); } -#endif + +#define 
VDR_IQ4_NL_Q8_1_MMVQ 2 static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_iq4_nl * bq = (const block_iq4_nl *) vbq + kbx; + const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx; -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; - const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + const int * q8 = (const int *) bq8_1->qs + iqs; - const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - - int v1, v2; - int sumi1 = 0, sumi2 = 0; + int sumi = 0; +#pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { - const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); - get_int_from_table_16(aux, values, v1, v2); - sumi1 = __dp4a(v1, q8[l+0], sumi1); - sumi2 = __dp4a(v2, q8[l+4], sumi2); + const int aux_q4 = get_int_b2(bq4->qs, iqs + l); + const int2 v = get_int_from_table_16(aux_q4); + + sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi); + sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi); } -#else - const uint8_t * q4 = bq->qs + 4*iqs; - const int8_t * q8 = bq8_1->qs + 4*iqs; - - int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) { - sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf]; - sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4]; - } -#endif - const float d = (float)bq->d * __low2float(bq8_1->ds); - return d * (sumi1 + sumi2); + const float d = __half2float(bq4->d) * __low2float(bq8_1->ds); + return d * sumi; } +#define VDR_IQ4_XS_Q8_1_MMVQ 4 + static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx; - const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - // iqs is 0...7 - const int ib32 = iqs; - const int32_t * q8 = (const int *)bq8_1[ib32].qs; - const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; - const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); - const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds); - int v1, v2; - int sumi1 = 0, sumi2 = 0; + int sumi = 0; +#pragma unroll for (int j = 0; j < 4; ++j) { - get_int_from_table_16(q4[j], values, v1, v2); - sumi1 = __dp4a(v1, q8[j+0], sumi1); - sumi2 = __dp4a(v2, q8[j+4], sumi2); + const int aux_q4 = get_int_b4(bq4->qs, iqs + j); + const int2 v = get_int_from_table_16(aux_q4); + + const int u0 = get_int_b4(bq8_1[iqs/4].qs, j + 0); + const int u1 = get_int_b4(bq8_1[iqs/4].qs, j + 4); + + sumi = ggml_cuda_dp4a(v.x, u0, sumi); + sumi = ggml_cuda_dp4a(v.y, u1, sumi); } - return d * (sumi1 + sumi2); -#else - return vec_dot_iq4_xs_q8_1(vbq, bq8_1, kbx, iqs); -#endif + + const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4); + sumi *= ls - 32; + + const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds); + return d * sumi; } diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 23227649e..9b751f3c6 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -735,7 +735,7 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - 
mul_mat_vec_q_iq2_xxs_q8_1( + mul_mat_vec_q_iq2_xxs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); }); @@ -760,7 +760,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq2_xs_q8_1( + mul_mat_vec_q_iq2_xs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); }); @@ -785,7 +785,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq2_s_q8_1( + mul_mat_vec_q_iq2_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); }); @@ -810,7 +810,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq3_xxs_q8_1( + mul_mat_vec_q_iq3_xxs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); }); @@ -834,7 +834,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq3_s_q8_1( + mul_mat_vec_q_iq3_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); }); @@ -924,7 +924,7 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q_iq4_xs_q8_1( + mul_mat_vec_q_iq4_xs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); }); diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp index 5e2e82546..d2dccade2 100644 --- a/ggml/src/ggml-sycl/vecdotq.hpp +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -820,7 +820,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, #if QK_K == 256 const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; -#if QR2_XXS == 8 const int ib32 = iqs; const uint16_t * q2 = bq2->qs + 4*ib32; const uint8_t * aux8 = (const uint8_t *)q2; @@ -838,26 +837,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, } const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f; return d * sumi; -#else - // iqs is 0...15 - const int ib32 = iqs/2; - const int il = iqs%2; - const uint16_t * q2 = bq2->qs + 4*ib32; - const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * bq8_1[ib32].ds[0] * 0.25f; - const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; - const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; - const int8_t * q8 = bq8_1[ib32].qs + 16*il; - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1); - sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1); - } - return d * (sumi1 + sumi2); -#endif #else assert(false); return 0.f;
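For reference, below is a minimal standalone sketch of the packed-byte dot product that the ggml_cuda_dp4a wrapper in this patch is built around: four signed 8-bit values are packed into each 32-bit int and multiplied/accumulated either with the __dp4a hardware intrinsic (available from compute capability 6.1, the threshold that MIN_CC_DP4A refers to) or with a scalar per-byte fallback. The helper and kernel names here (sketch_dp4a, dot_kernel) are illustrative only and are not part of the patch.

#include <cstdint>
#include <cstdio>

// Portable 4x int8 dot product, mirroring the idea behind ggml_cuda_dp4a:
// use the hardware intrinsic where available, otherwise fall back to scalars.
__device__ __forceinline__ int sketch_dp4a(const int a, const int b, int c) {
#if __CUDA_ARCH__ >= 610
    return __dp4a(a, b, c);                    // SIMD dot product of 4 int8 pairs
#else
    const int8_t * a8 = (const int8_t *) &a;   // scalar fallback, one byte at a time
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif
}

// Each thread accumulates 4 int8 products per loaded int, then reduces atomically.
__global__ void dot_kernel(const int * x, const int * y, const int n32, int * out) {
    int sum = 0;
    for (int i = threadIdx.x; i < n32; i += blockDim.x) {
        sum = sketch_dp4a(x[i], y[i], sum);
    }
    atomicAdd(out, sum);
}

int main() {
    const int n32 = 256;                       // 256 ints == 1024 packed int8 values
    int hx[n32], hy[n32];
    for (int i = 0; i < n32; ++i) {
        hx[i] = 0x01010101;                    // every byte == 1
        hy[i] = 0x02020202;                    // every byte == 2
    }
    int * dx; int * dy; int * dout;
    cudaMalloc(&dx,   sizeof(hx));
    cudaMalloc(&dy,   sizeof(hy));
    cudaMalloc(&dout, sizeof(int));
    cudaMemcpy(dx, hx, sizeof(hx), cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy, sizeof(hy), cudaMemcpyHostToDevice);
    cudaMemset(dout, 0, sizeof(int));
    dot_kernel<<<1, 128>>>(dx, dy, n32, dout);
    int result = 0;
    cudaMemcpy(&result, dout, sizeof(int), cudaMemcpyDeviceToHost);
    printf("dot = %d (expected %d)\n", result, 1024*1*2); // 1024 byte pairs, 1*2 each
    cudaFree(dx); cudaFree(dy); cudaFree(dout);
    return 0;
}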