CUDA: refactor and optimize IQ MMVQ (#8215)

* CUDA: refactor and optimize IQ MMVQ

* uint -> uint32_t

* __dp4a -> ggml_cuda_dp4a

* remove MIN_CC_DP4A checks

* change default

* try CI fix
Johannes Gäßler 2024-07-01 20:39:06 +02:00 committed by GitHub
parent dae57a1ebc
commit cb5fad4c6c
8 changed files with 406 additions and 487 deletions


@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
 #define QR6_K 2
 #define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS 8
+#define QR2_XXS 4
 #define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS 8
+#define QR2_XS 4
 #define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S 8
+#define QR2_S 4
 #define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS 8
+#define QR3_XXS 4
 #define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS 8
+#define QR3_XS 4
 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
 #define QR4_NL 2
 #define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS 8
+#define QR4_XS 2
 #define QI3_S (QK_K / (4*QR3_S))
-#define QR3_S 8
+#define QR3_S 4
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
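
Note: QI is derived from QR as QK_K / (4*QR), so halving QR2_XXS/QR2_XS/QR2_S/QR3_XXS/QR3_XS (and lowering QR4_XS to 2 and QR3_S to 4) enlarges the corresponding QI constants. A minimal sketch of the arithmetic, assuming the usual QK_K == 256:

    // Sketch only: how QI changes when QR shrinks (assumes QK_K == 256).
    constexpr int QK_K = 256;
    constexpr int QI(int qr) { return QK_K / (4*qr); }

    static_assert(QI(8) == 8,  "old QR = 8 -> QI = 8");
    static_assert(QI(4) == 16, "new QR = 4 -> QI = 16");
    static_assert(QI(2) == 32, "new QR = 2 -> QI = 32");

    int main() { return 0; }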


@@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_mul_mat_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

+    // if mmvq is available it's a better choice than dmmv:
+#ifndef GGML_CUDA_FORCE_DMMV
+    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_CUDA_FORCE_DMMV
+
     bool any_gpus_with_slow_fp16 = false;

     if (split) {
@@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             }

             const int cc = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
     }
-
-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
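
Note: the dmmv-vs-mmvq decision moves above the per-device loop, presumably because mmvq no longer has a per-device cc >= MIN_CC_DP4A filter (ggml_cuda_dp4a below carries its own fallback), so the preference can be decided once up front. A trivial sketch of the hoisted decision, with plain bools standing in for the real ggml checks:

    // Sketch only: simplified order of the selection after this change.
    // The bools are stand-ins for use_dequantize_mul_mat_vec / use_mul_mat_vec_q.
    #include <cstdio>

    int main() {
        bool use_dmmv = true;
        const bool use_mmvq = true;

        // if mmvq is available it's a better choice than dmmv:
        use_dmmv = use_dmmv && !use_mmvq;

        printf("dmmv=%d mmvq=%d\n", use_dmmv, use_mmvq);
        return 0;
    }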


@@ -3,6 +3,7 @@
 #include "ggml.h"
 #include "ggml-cuda.h"
+#include <cstdint>
 #include <memory>

 #if defined(GGML_USE_HIPBLAS)
@@ -268,30 +269,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
     return c;
 }

-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+    }
     return c;
 }
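
Note: the new __vcmpne4 mirrors the existing __vcmpeq4 emulation for HIP: it compares the four byte lanes of two 32-bit values and returns 0xff in every lane that differs, 0x00 where they match. A small host-side model of the same semantics, for illustration only:

    // Sketch only: host-side model of the byte-wise "not equal" mask.
    #include <cstdint>
    #include <cstdio>

    static uint32_t vcmpne4_ref(uint32_t a, uint32_t b) {
        uint32_t c = 0;
        for (int i = 0; i < 4; ++i) {
            const uint8_t va = (a >> (8*i)) & 0xff;
            const uint8_t vb = (b >> (8*i)) & 0xff;
            c |= uint32_t(va == vb ? 0x00 : 0xff) << (8*i);
        }
        return c;
    }

    int main() {
        printf("%08x\n", vcmpne4_ref(0x01020304u, 0x01ff03aau)); // prints 00ff00ff
        return 0;
    }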
@@ -467,8 +453,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 }
 #endif // CUDART_VERSION < 12000

+static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    return __dp4a(a, b, c);
+#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int8_t * a8 = (const int8_t *) &a;
+    const int8_t * b8 = (const int8_t *) &b;
+    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
 // TODO: move to ggml-common.h
-static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
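
Note: ggml_cuda_dp4a folds the old HIP-only __dp4a emulation and a generic pre-DP4A fallback into a single helper: a and b are treated as four packed signed 8-bit lanes, multiplied lane-wise and accumulated into c. A host-side reference of the same computation, mirroring the generic fallback shown above:

    // Sketch only: host-side reference for the dp4a semantics.
    #include <cstdint>
    #include <cstdio>

    static int dp4a_ref(int a, int b, int c) {
        const int8_t * a8 = (const int8_t *) &a;
        const int8_t * b8 = (const int8_t *) &b;
        return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
    }

    int main() {
        const int a = 0x01020304;            // lanes 4, 3, 2, 1 on a little-endian machine
        const int b = (int) 0xFFFEFDFCu;     // lanes -4, -3, -2, -1 on a little-endian machine
        printf("%d\n", dp4a_ref(a, b, 0));   // 4*-4 + 3*-3 + 2*-2 + 1*-1 = -30
        return 0;
    }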


@@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)(
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
     GGML_UNUSED(Q_v);

-    half sum = 0.0f;
+    T sum = 0.0f;

 #pragma unroll
     for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
@@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
         const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
         const int u = Q_q8[k_KQ_0/WARP_SIZE];

-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);

 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
     GGML_UNUSED(Q_v);
@@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
         const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
         const int u = Q_q8[k_KQ_0/WARP_SIZE];

-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);

 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
     GGML_UNUSED(Q_v);
@@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(

         const int u = Q_q8[k_KQ_0/WARP_SIZE];

-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);

 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
     GGML_UNUSED(Q_v);
@@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(

         const int u = Q_q8[k_KQ_0/WARP_SIZE];

-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);

 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A

     const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
     GGML_UNUSED(Q_v);
@@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     }

     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <typename T, int D>
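
Note: since ggml_cuda_dp4a handles architectures without DP4A itself, each vec_dot_fattn_vec_KQ_* function can drop its `#if __CUDA_ARCH__ >= MIN_CC_DP4A ... #else NO_DEVICE_CODE` wrapper. A minimal sketch of the pattern, where my_dp4a is a hypothetical stand-in for the real helper and the literal 610 stands in for MIN_CC_DP4A (DP4A arrived with compute capability 6.1):

    // Sketch only: the architecture guard lives inside the helper,
    // so callers need no per-function __CUDA_ARCH__ scaffolding.
    __device__ __forceinline__ int my_dp4a(int a, int b, int c) {
    #if __CUDA_ARCH__ >= 610
        return __dp4a(a, b, c);                       // hardware DP4A path
    #else
        const signed char * a8 = (const signed char *) &a;
        const signed char * b8 = (const signed char *) &b;
        return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
    #endif
    }

    __global__ void dot_kernel(const int * v, const int * u, int * out) {
        out[0] = my_dp4a(v[0], u[0], 0);              // no guard needed here
    }

    int main() { return 0; }                          // host stub; launch omitted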


@@ -28,16 +28,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
 static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
     return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
         type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
         type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
         type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
         type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
         type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
         type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
         type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
         type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
         type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
-        type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
        1;
 }
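
Note: the IQ types now resolve to their own VDR_* constants instead of borrowing VDR_Q4_K_Q8_1_MMVQ. Because get_vdr_mmvq is a constexpr ternary chain, the per-type VDR (values handled per dot-product iteration) is folded into the kernel template at compile time. A minimal sketch of the same pattern, with made-up names and values:

    // Sketch only: compile-time table lookup via a constexpr ternary chain.
    enum fake_type { FAKE_Q4_0, FAKE_IQ2_XXS, FAKE_OTHER };

    static constexpr int fake_get_vdr(fake_type t) {
        return t == FAKE_Q4_0    ? 2 :
               t == FAKE_IQ2_XXS ? 2 :
               1;
    }

    template <fake_type T>
    static void kernel_stub() {
        constexpr int vdr = fake_get_vdr(T);   // resolved at compile time
        static_assert(vdr >= 1, "vdr must be positive");
    }

    int main() {
        kernel_stub<FAKE_IQ2_XXS>();
        return 0;
    }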

File diff suppressed because it is too large.


@@ -735,7 +735,7 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
                 [[intel::reqd_sub_group_size(32)]] {
-                    mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
+                    mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
     });
@@ -760,7 +760,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
                 [[intel::reqd_sub_group_size(32)]] {
-                    mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
+                    mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
     });
@@ -785,7 +785,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
                 [[intel::reqd_sub_group_size(32)]] {
-                    mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
+                    mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
     });
@@ -810,7 +810,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
                 [[intel::reqd_sub_group_size(32)]] {
-                    mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
+                    mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
     });
@@ -834,7 +834,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
                 [[intel::reqd_sub_group_size(32)]] {
-                    mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
+                    mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
     });
@@ -924,7 +924,7 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
                 [[intel::reqd_sub_group_size(32)]] {
-                    mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
+                    mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
     });
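
Note: these SYCL launchers now pass QI2_XXS/2 and so on (and QI4_XS/4) because the QR changes in ggml-common.h enlarged the QI constants; the division restores the iteration counts the SYCL kernels were written against. A quick check of the compensation, assuming QK_K == 256:

    // Sketch only: dividing the enlarged QI recovers the old value.
    constexpr int QK_K = 256;
    constexpr int QI(int qr) { return QK_K / (4*qr); }

    static_assert(QI(4)/2 == QI(8), "QI2_XXS/2 with new QR 4 == old QI2_XXS with QR 8");
    static_assert(QI(2)/4 == QI(8), "QI4_XS/4  with new QR 2 == old QI4_XS  with QR 8");

    int main() { return 0; }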


@@ -820,7 +820,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
 #if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

-#if QR2_XXS == 8
     const int ib32 = iqs;
     const uint16_t * q2 = bq2->qs + 4*ib32;
     const uint8_t * aux8 = (const uint8_t *)q2;
@@ -838,26 +837,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
     }
     const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
     return d * sumi;
-#else
-    // iqs is 0...15
-    const int ib32 = iqs/2;
-    const int il = iqs%2;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t * aux8 = (const uint8_t *)q2;
-    const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
-    const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * bq8_1[ib32].ds[0] * 0.25f;
-    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
-    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
-    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
-    int sumi1 = 0, sumi2 = 0;
-    for (int j = 0; j < 8; ++j) {
-        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
-        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
-    }
-    return d * (sumi1 + sumi2);
-#endif
 #else
     assert(false);
     return 0.f;