mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
CUDA: MMQ code deduplication + iquant support (#8495)
* CUDA: MMQ code deduplication + iquant support * 1 less parallel job for CI build
This commit is contained in:
parent
07283b1a90
commit
69c487f4ed
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@ -860,7 +860,7 @@ jobs:
|
|||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
|
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
|
||||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
|
||||||
|
|
||||||
- name: Determine tag name
|
- name: Determine tag name
|
||||||
id: tag
|
id: tag
|
||||||
|
@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
|
|||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
|
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
|
case GGML_TYPE_IQ2_XXS:
|
||||||
|
mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_IQ2_XS:
|
||||||
|
mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_IQ2_S:
|
||||||
|
mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_IQ3_S:
|
||||||
|
mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_IQ1_S:
|
||||||
|
mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
|
||||||
|
break;
|
||||||
case GGML_TYPE_IQ4_XS:
|
case GGML_TYPE_IQ4_XS:
|
||||||
mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
|
mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
|||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_IQ2_XXS:
|
||||||
|
case GGML_TYPE_IQ2_XS:
|
||||||
|
case GGML_TYPE_IQ2_S:
|
||||||
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
case GGML_TYPE_IQ3_S:
|
||||||
|
case GGML_TYPE_IQ1_S:
|
||||||
case GGML_TYPE_IQ4_XS:
|
case GGML_TYPE_IQ4_XS:
|
||||||
case GGML_TYPE_IQ4_NL:
|
case GGML_TYPE_IQ4_NL:
|
||||||
mmq_supported = true;
|
mmq_supported = true;
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
|
|||||||
TYPES_MMQ = [
|
TYPES_MMQ = [
|
||||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||||
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
|
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
|
||||||
"GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
|
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
|
||||||
|
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
|
||||||
]
|
]
|
||||||
|
|
||||||
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||||
|
|
||||||
|
#include "../mmq.cuh"
|
||||||
|
|
||||||
|
DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
|
@ -0,0 +1,5 @@
|
|||||||
|
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||||
|
|
||||||
|
#include "../mmq.cuh"
|
||||||
|
|
||||||
|
DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
|
@ -0,0 +1,5 @@
|
|||||||
|
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||||
|
|
||||||
|
#include "../mmq.cuh"
|
||||||
|
|
||||||
|
DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
|
@ -0,0 +1,5 @@
|
|||||||
|
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||||
|
|
||||||
|
#include "../mmq.cuh"
|
||||||
|
|
||||||
|
DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
|
@ -0,0 +1,5 @@
|
|||||||
|
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||||
|
|
||||||
|
#include "../mmq.cuh"
|
||||||
|
|
||||||
|
DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
|
@ -0,0 +1,5 @@
|
|||||||
|
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||||
|
|
||||||
|
#include "../mmq.cuh"
|
||||||
|
|
||||||
|
DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
|
@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|||||||
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
|
||||||
|
const int * v, const int * u, const float * d8_0, const float & d8_1) {
|
||||||
|
|
||||||
|
float sumf = 0.0f;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
|
||||||
|
int sumi = 0;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = i0; i < i0 + QI8_0/2; ++i) {
|
||||||
|
// SIMD dot product of quantized values
|
||||||
|
sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf += d8_0[i0/(QI8_0/2)]*sumi;
|
||||||
|
}
|
||||||
|
|
||||||
|
return d8_1*sumf;
|
||||||
|
}
|
||||||
|
|
||||||
#define VDR_Q2_K_Q8_1_MMVQ 1
|
#define VDR_Q2_K_Q8_1_MMVQ 1
|
||||||
#define VDR_Q2_K_Q8_1_MMQ 4
|
#define VDR_Q2_K_Q8_1_MMQ 4
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user