mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
k-quants : allow to optionally disable at compile time (#1734)
* k-quants : put behind optional compile flag LLAMA_K_QUANTS * build : enable k-quants by default
This commit is contained in:
parent
5b57a5b726
commit
5c64a0952e
@ -72,6 +72,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
|
|||||||
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
|
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
|
||||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||||
|
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||||
|
|
||||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||||
@ -226,6 +227,10 @@ if (LLAMA_METAL)
|
|||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (LLAMA_K_QUANTS)
|
||||||
|
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (LLAMA_CLBLAST)
|
if (LLAMA_CLBLAST)
|
||||||
find_package(CLBlast)
|
find_package(CLBlast)
|
||||||
if (CLBlast_FOUND)
|
if (CLBlast_FOUND)
|
||||||
@ -396,11 +401,10 @@ endif()
|
|||||||
add_library(ggml OBJECT
|
add_library(ggml OBJECT
|
||||||
ggml.c
|
ggml.c
|
||||||
ggml.h
|
ggml.h
|
||||||
ggml-quants-k.h
|
|
||||||
ggml-quants-k.c
|
|
||||||
${GGML_SOURCES_CUDA}
|
${GGML_SOURCES_CUDA}
|
||||||
${GGML_SOURCES_OPENCL}
|
${GGML_SOURCES_OPENCL}
|
||||||
${GGML_SOURCES_METAL}
|
${GGML_SOURCES_METAL}
|
||||||
|
${GGML_SOURCES_EXTRA}
|
||||||
)
|
)
|
||||||
|
|
||||||
target_include_directories(ggml PUBLIC .)
|
target_include_directories(ggml PUBLIC .)
|
||||||
|
37
Makefile
37
Makefile
@ -121,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
|
|||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef LLAMA_NO_K_QUANTS
|
||||||
|
CFLAGS += -DGGML_USE_K_QUANTS
|
||||||
|
OBJS += k_quants.o
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef LLAMA_NO_ACCELERATE
|
ifndef LLAMA_NO_ACCELERATE
|
||||||
# Mac M1 - include Accelerate framework.
|
# Mac M1 - include Accelerate framework.
|
||||||
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
|
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
|
||||||
@ -140,7 +145,7 @@ ifdef LLAMA_OPENBLAS
|
|||||||
endif # LLAMA_OPENBLAS
|
endif # LLAMA_OPENBLAS
|
||||||
|
|
||||||
ifdef LLAMA_BLIS
|
ifdef LLAMA_BLIS
|
||||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||||
LDFLAGS += -lblis -L/usr/local/lib
|
LDFLAGS += -lblis -L/usr/local/lib
|
||||||
endif # LLAMA_BLIS
|
endif # LLAMA_BLIS
|
||||||
|
|
||||||
@ -212,6 +217,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
|
|||||||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef LLAMA_NO_K_QUANTS
|
||||||
|
k_quants.o: k_quants.c k_quants.h
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
endif # LLAMA_NO_K_QUANTS
|
||||||
|
|
||||||
#
|
#
|
||||||
# Print build information
|
# Print build information
|
||||||
#
|
#
|
||||||
@ -231,10 +241,7 @@ $(info )
|
|||||||
# Build library
|
# Build library
|
||||||
#
|
#
|
||||||
|
|
||||||
ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h
|
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
|
||||||
|
|
||||||
ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h
|
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
|
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
|
||||||
@ -243,7 +250,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
|
|||||||
common.o: examples/common.cpp examples/common.h
|
common.o: examples/common.cpp examples/common.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS)
|
libllama.so: llama.o ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
@ -253,28 +260,28 @@ clean:
|
|||||||
# Examples
|
# Examples
|
||||||
#
|
#
|
||||||
|
|
||||||
main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
@echo
|
@echo
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./main -h for help. ===='
|
||||||
@echo
|
@echo
|
||||||
|
|
||||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
|
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
|
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||||
@ -289,11 +296,11 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
|||||||
# Tests
|
# Tests
|
||||||
#
|
#
|
||||||
|
|
||||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS)
|
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
./$@
|
./$@
|
||||||
|
|
||||||
vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS)
|
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
.PHONY: tests clean
|
.PHONY: tests clean
|
||||||
|
120
ggml-cuda.cu
120
ggml-cuda.cu
@ -110,24 +110,24 @@ typedef struct {
|
|||||||
uint8_t qs[QK_K/4]; // quants
|
uint8_t qs[QK_K/4]; // quants
|
||||||
half d; // super-block scale for quantized scales
|
half d; // super-block scale for quantized scales
|
||||||
half dmin; // super-block scale for quantized mins
|
half dmin; // super-block scale for quantized mins
|
||||||
} block_q2_k;
|
} block_q2_K;
|
||||||
static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding");
|
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t hmask[QK_K/8];
|
uint8_t hmask[QK_K/8];
|
||||||
uint8_t qs[QK_K/4]; // nibbles / quants
|
uint8_t qs[QK_K/4]; // nibbles / quants
|
||||||
uint8_t scales[3*QK_K/64];
|
uint8_t scales[3*QK_K/64];
|
||||||
half d;
|
half d;
|
||||||
} block_q3_k;
|
} block_q3_K;
|
||||||
static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding");
|
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d; // super-block scale for quantized scales
|
half d; // super-block scale for quantized scales
|
||||||
half dmin; // super-block scale for quantized mins
|
half dmin; // super-block scale for quantized mins
|
||||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||||
} block_q4_k;
|
} block_q4_K;
|
||||||
static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding");
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d; // super-block scale for quantized scales
|
half d; // super-block scale for quantized scales
|
||||||
@ -135,16 +135,16 @@ typedef struct {
|
|||||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||||
uint8_t qh[QK_K/8]; // quants, high bit
|
uint8_t qh[QK_K/8]; // quants, high bit
|
||||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||||
} block_q5_k;
|
} block_q5_K;
|
||||||
static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding");
|
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||||
int8_t scales[QK_K/16]; // scales
|
int8_t scales[QK_K/16]; // scales
|
||||||
half d; // delta
|
half d; // delta
|
||||||
} block_q6_k;
|
} block_q6_K;
|
||||||
static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding");
|
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
||||||
|
|
||||||
#define WARP_SIZE 32
|
#define WARP_SIZE 32
|
||||||
|
|
||||||
@ -299,7 +299,7 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int
|
|||||||
|
|
||||||
//================================== k-quants
|
//================================== k-quants
|
||||||
|
|
||||||
static __global__ void dequantize_block_q2_k(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
@ -307,7 +307,7 @@ static __global__ void dequantize_block_q2_k(const void * vx, float * yy) {
|
|||||||
const int l = tid - 32*n;
|
const int l = tid - 32*n;
|
||||||
const int is = 8*n + l/16;
|
const int is = 8*n + l/16;
|
||||||
|
|
||||||
const block_q2_k * x = (const block_q2_k *) vx;
|
const block_q2_K * x = (const block_q2_K *) vx;
|
||||||
|
|
||||||
const uint8_t q = x[i].qs[32*n + l];
|
const uint8_t q = x[i].qs[32*n + l];
|
||||||
float * y = yy + i*QK_K + 128*n;
|
float * y = yy + i*QK_K + 128*n;
|
||||||
@ -321,9 +321,9 @@ static __global__ void dequantize_block_q2_k(const void * vx, float * yy) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||||
|
|
||||||
const block_q2_k * x = (const block_q2_k *) vx;
|
const block_q2_K * x = (const block_q2_K *) vx;
|
||||||
|
|
||||||
// if n is 0, we want to do the lower 128, else the upper 128,
|
// if n is 0, we want to do the lower 128, else the upper 128,
|
||||||
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
|
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
|
||||||
@ -352,7 +352,7 @@ static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q3_k(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
||||||
|
|
||||||
int r = threadIdx.x/4;
|
int r = threadIdx.x/4;
|
||||||
int i = blockIdx.x;
|
int i = blockIdx.x;
|
||||||
@ -362,7 +362,7 @@ static __global__ void dequantize_block_q3_k(const void * vx, float * yy) {
|
|||||||
int n = tid / 4;
|
int n = tid / 4;
|
||||||
int j = tid - 4*n;
|
int j = tid - 4*n;
|
||||||
|
|
||||||
const block_q3_k * x = (const block_q3_k *) vx;
|
const block_q3_K * x = (const block_q3_K *) vx;
|
||||||
|
|
||||||
uint8_t m = 1 << (4*n + j);
|
uint8_t m = 1 << (4*n + j);
|
||||||
int is = 8*n + 2*j + is0;
|
int is = 8*n + 2*j + is0;
|
||||||
@ -383,9 +383,9 @@ static __global__ void dequantize_block_q3_k(const void * vx, float * yy) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ void vec_dot_q3_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||||
|
|
||||||
const block_q3_k * x = (const block_q3_k *) vx;
|
const block_q3_K * x = (const block_q3_K *) vx;
|
||||||
|
|
||||||
const uint32_t kmask1 = 0x03030303;
|
const uint32_t kmask1 = 0x03030303;
|
||||||
const uint32_t kmask2 = 0x0f0f0f0f;
|
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
@ -437,8 +437,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q4_k(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
||||||
const block_q4_k * x = (const block_q4_k *) vx;
|
const block_q4_K * x = (const block_q4_K *) vx;
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
|
|
||||||
@ -474,9 +474,9 @@ static __global__ void dequantize_block_q4_k(const void * vx, float * yy) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||||
|
|
||||||
const block_q4_k * x = (const block_q4_k *) vx;
|
const block_q4_K * x = (const block_q4_K *) vx;
|
||||||
|
|
||||||
// iqs is in 0...248 in steps of 8 =>
|
// iqs is in 0...248 in steps of 8 =>
|
||||||
const int j = iqs / 64; // j is in 0...3
|
const int j = iqs / 64; // j is in 0...3
|
||||||
@ -506,8 +506,8 @@ static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q5_k(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
||||||
const block_q5_k * x = (const block_q5_k *) vx;
|
const block_q5_K * x = (const block_q5_K *) vx;
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
|
|
||||||
@ -539,9 +539,9 @@ static __global__ void dequantize_block_q5_k(const void * vx, float * yy) {
|
|||||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||||
|
|
||||||
const block_q5_k * x = (const block_q5_k *) vx;
|
const block_q5_K * x = (const block_q5_K *) vx;
|
||||||
|
|
||||||
// iqs is in 0...248 in steps of 8 =>
|
// iqs is in 0...248 in steps of 8 =>
|
||||||
const int j = iqs / 64; // j is in 0...3
|
const int j = iqs / 64; // j is in 0...3
|
||||||
@ -576,8 +576,8 @@ static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q6_k(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
||||||
const block_q6_k * x = (const block_q6_k *) vx;
|
const block_q6_K * x = (const block_q6_K *) vx;
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
|
|
||||||
@ -601,9 +601,9 @@ static __global__ void dequantize_block_q6_k(const void * vx, float * yy) {
|
|||||||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ void vec_dot_q6_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||||
|
|
||||||
const block_q6_k * x = (const block_q6_k *) vx;
|
const block_q6_K * x = (const block_q6_K *) vx;
|
||||||
|
|
||||||
const int ip = iqs / 128; // 0 or 1
|
const int ip = iqs / 128; // 0 or 1
|
||||||
const int il = (iqs - 128*ip)/8; // 0...15
|
const int il = (iqs - 128*ip)/8; // 0...15
|
||||||
@ -804,29 +804,29 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
|
|||||||
dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_row_q2_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
dequantize_block_q2_k<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_row_q3_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
dequantize_block_q3_k<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_row_q4_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
dequantize_block_q4_k<<<nb, 32, 0, stream>>>(vx, y);
|
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_row_q5_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
dequantize_block_q5_k<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_row_q6_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
dequantize_block_q6_k<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
@ -869,35 +869,35 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
|
|||||||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q2_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int ny = 2;
|
const int ny = 2;
|
||||||
const dim3 block_dims(32, ny, 1);
|
const dim3 block_dims(32, ny, 1);
|
||||||
dequantize_mul_mat_vec_k<32, vec_dot_q2_k><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q3_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const dim3 block_dims(32, 2, 1);
|
const dim3 block_dims(32, 2, 1);
|
||||||
dequantize_mul_mat_vec_k<32, vec_dot_q3_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q4_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const dim3 block_dims(32, 2, 1);
|
const dim3 block_dims(32, 2, 1);
|
||||||
dequantize_mul_mat_vec_k<32, vec_dot_q4_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q5_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const dim3 block_dims(32, 2, 1);
|
const dim3 block_dims(32, 2, 1);
|
||||||
dequantize_mul_mat_vec_k<32, vec_dot_q5_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q6_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const dim3 block_dims(32, 2, 1);
|
const dim3 block_dims(32, 2, 1);
|
||||||
dequantize_mul_mat_vec_k<32, vec_dot_q6_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
@ -926,15 +926,15 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
return dequantize_row_q8_0_cuda;
|
return dequantize_row_q8_0_cuda;
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
return dequantize_row_q2_k_cuda;
|
return dequantize_row_q2_K_cuda;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
return dequantize_row_q3_k_cuda;
|
return dequantize_row_q3_K_cuda;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
return dequantize_row_q4_k_cuda;
|
return dequantize_row_q4_K_cuda;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
return dequantize_row_q5_k_cuda;
|
return dequantize_row_q5_K_cuda;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
return dequantize_row_q6_k_cuda;
|
return dequantize_row_q6_K_cuda;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
return convert_fp16_to_fp32_cuda;
|
return convert_fp16_to_fp32_cuda;
|
||||||
default:
|
default:
|
||||||
@ -1277,19 +1277,19 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|||||||
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
dequantize_mul_mat_vec_q2_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
dequantize_mul_mat_vec_q3_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
dequantize_mul_mat_vec_q4_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
dequantize_mul_mat_vec_q5_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
dequantize_mul_mat_vec_q6_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
|
107
ggml.c
107
ggml.c
@ -2,7 +2,10 @@
|
|||||||
#define _GNU_SOURCE
|
#define _GNU_SOURCE
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-quants-k.h"
|
|
||||||
|
#ifdef GGML_USE_K_QUANTS
|
||||||
|
#include "k_quants.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||||
@ -1580,46 +1583,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
|||||||
.vec_dot_q = NULL, // TODO
|
.vec_dot_q = NULL, // TODO
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
},
|
},
|
||||||
|
#ifdef GGML_USE_K_QUANTS
|
||||||
[GGML_TYPE_Q2_K] = {
|
[GGML_TYPE_Q2_K] = {
|
||||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_k,
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
|
||||||
.quantize_row_q = quantize_row_q2_k,
|
.quantize_row_q = quantize_row_q2_K,
|
||||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_k_reference,
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
|
||||||
.quantize_row_q_dot = quantize_row_q8_k,
|
.quantize_row_q_dot = quantize_row_q8_K,
|
||||||
.vec_dot_q = ggml_vec_dot_q2_k_q8_k,
|
.vec_dot_q = ggml_vec_dot_q2_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q3_K] = {
|
[GGML_TYPE_Q3_K] = {
|
||||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_k,
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
|
||||||
.quantize_row_q = quantize_row_q3_k,
|
.quantize_row_q = quantize_row_q3_K,
|
||||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_k_reference,
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
|
||||||
.quantize_row_q_dot = quantize_row_q8_k,
|
.quantize_row_q_dot = quantize_row_q8_K,
|
||||||
.vec_dot_q = ggml_vec_dot_q3_k_q8_k,
|
.vec_dot_q = ggml_vec_dot_q3_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_K] = {
|
[GGML_TYPE_Q4_K] = {
|
||||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_k,
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
|
||||||
.quantize_row_q = quantize_row_q4_k,
|
.quantize_row_q = quantize_row_q4_K,
|
||||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_k_reference,
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
|
||||||
.quantize_row_q_dot = quantize_row_q8_k,
|
.quantize_row_q_dot = quantize_row_q8_K,
|
||||||
.vec_dot_q = ggml_vec_dot_q4_k_q8_k,
|
.vec_dot_q = ggml_vec_dot_q4_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_K] = {
|
[GGML_TYPE_Q5_K] = {
|
||||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_k,
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
|
||||||
.quantize_row_q = quantize_row_q5_k,
|
.quantize_row_q = quantize_row_q5_K,
|
||||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_k_reference,
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
|
||||||
.quantize_row_q_dot = quantize_row_q8_k,
|
.quantize_row_q_dot = quantize_row_q8_K,
|
||||||
.vec_dot_q = ggml_vec_dot_q5_k_q8_k,
|
.vec_dot_q = ggml_vec_dot_q5_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q6_K] = {
|
[GGML_TYPE_Q6_K] = {
|
||||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_k,
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
|
||||||
.quantize_row_q = quantize_row_q6_k,
|
.quantize_row_q = quantize_row_q6_K,
|
||||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_k_reference,
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
|
||||||
.quantize_row_q_dot = quantize_row_q8_k,
|
.quantize_row_q_dot = quantize_row_q8_K,
|
||||||
.vec_dot_q = ggml_vec_dot_q6_k_q8_k,
|
.vec_dot_q = ggml_vec_dot_q6_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
},
|
},
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
// For internal test use
|
// For internal test use
|
||||||
@ -3499,12 +3504,14 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
|
|||||||
[GGML_TYPE_Q5_1] = QK5_1,
|
[GGML_TYPE_Q5_1] = QK5_1,
|
||||||
[GGML_TYPE_Q8_0] = QK8_0,
|
[GGML_TYPE_Q8_0] = QK8_0,
|
||||||
[GGML_TYPE_Q8_1] = QK8_1,
|
[GGML_TYPE_Q8_1] = QK8_1,
|
||||||
|
#ifdef GGML_USE_K_QUANTS
|
||||||
[GGML_TYPE_Q2_K] = QK_K,
|
[GGML_TYPE_Q2_K] = QK_K,
|
||||||
[GGML_TYPE_Q3_K] = QK_K,
|
[GGML_TYPE_Q3_K] = QK_K,
|
||||||
[GGML_TYPE_Q4_K] = QK_K,
|
[GGML_TYPE_Q4_K] = QK_K,
|
||||||
[GGML_TYPE_Q5_K] = QK_K,
|
[GGML_TYPE_Q5_K] = QK_K,
|
||||||
[GGML_TYPE_Q6_K] = QK_K,
|
[GGML_TYPE_Q6_K] = QK_K,
|
||||||
[GGML_TYPE_Q8_K] = QK_K,
|
[GGML_TYPE_Q8_K] = QK_K,
|
||||||
|
#endif
|
||||||
[GGML_TYPE_I8] = 1,
|
[GGML_TYPE_I8] = 1,
|
||||||
[GGML_TYPE_I16] = 1,
|
[GGML_TYPE_I16] = 1,
|
||||||
[GGML_TYPE_I32] = 1,
|
[GGML_TYPE_I32] = 1,
|
||||||
@ -3520,12 +3527,14 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
|||||||
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
|
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
|
||||||
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
||||||
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
||||||
[GGML_TYPE_Q2_K] = sizeof(block_q2_k),
|
#ifdef GGML_USE_K_QUANTS
|
||||||
[GGML_TYPE_Q3_K] = sizeof(block_q3_k),
|
[GGML_TYPE_Q2_K] = sizeof(block_q2_K),
|
||||||
[GGML_TYPE_Q4_K] = sizeof(block_q4_k),
|
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
|
||||||
[GGML_TYPE_Q5_K] = sizeof(block_q5_k),
|
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
|
||||||
[GGML_TYPE_Q6_K] = sizeof(block_q6_k),
|
[GGML_TYPE_Q5_K] = sizeof(block_q5_K),
|
||||||
[GGML_TYPE_Q8_K] = sizeof(block_q8_k),
|
[GGML_TYPE_Q6_K] = sizeof(block_q6_K),
|
||||||
|
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
|
||||||
|
#endif
|
||||||
[GGML_TYPE_I8] = sizeof(int8_t),
|
[GGML_TYPE_I8] = sizeof(int8_t),
|
||||||
[GGML_TYPE_I16] = sizeof(int16_t),
|
[GGML_TYPE_I16] = sizeof(int16_t),
|
||||||
[GGML_TYPE_I32] = sizeof(int32_t),
|
[GGML_TYPE_I32] = sizeof(int32_t),
|
||||||
@ -3542,12 +3551,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
|||||||
[GGML_TYPE_Q5_1] = "q5_1",
|
[GGML_TYPE_Q5_1] = "q5_1",
|
||||||
[GGML_TYPE_Q8_0] = "q8_0",
|
[GGML_TYPE_Q8_0] = "q8_0",
|
||||||
[GGML_TYPE_Q8_1] = "q8_1",
|
[GGML_TYPE_Q8_1] = "q8_1",
|
||||||
[GGML_TYPE_Q2_K] = "q2_k",
|
[GGML_TYPE_Q2_K] = "q2_K",
|
||||||
[GGML_TYPE_Q3_K] = "q3_k",
|
[GGML_TYPE_Q3_K] = "q3_K",
|
||||||
[GGML_TYPE_Q4_K] = "q4_k",
|
[GGML_TYPE_Q4_K] = "q4_K",
|
||||||
[GGML_TYPE_Q5_K] = "q5_k",
|
[GGML_TYPE_Q5_K] = "q5_K",
|
||||||
[GGML_TYPE_Q6_K] = "q6_k",
|
[GGML_TYPE_Q6_K] = "q6_K",
|
||||||
[GGML_TYPE_Q8_K] = "q8_k",
|
[GGML_TYPE_Q8_K] = "q8_K",
|
||||||
[GGML_TYPE_I8] = "i8",
|
[GGML_TYPE_I8] = "i8",
|
||||||
[GGML_TYPE_I16] = "i16",
|
[GGML_TYPE_I16] = "i16",
|
||||||
[GGML_TYPE_I32] = "i32",
|
[GGML_TYPE_I32] = "i32",
|
||||||
@ -16249,36 +16258,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|||||||
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
||||||
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
||||||
} break;
|
} break;
|
||||||
|
#ifdef GGML_USE_K_QUANTS
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q2_k * block = (block_q2_k*)dst + start / QK_K;
|
block_q2_K * block = (block_q2_K*)dst + start / QK_K;
|
||||||
result = ggml_quantize_q2_k(src + start, block, n, n, hist);
|
result = ggml_quantize_q2_K(src + start, block, n, n, hist);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q3_k * block = (block_q3_k*)dst + start / QK_K;
|
block_q3_K * block = (block_q3_K*)dst + start / QK_K;
|
||||||
result = ggml_quantize_q3_k(src + start, block, n, n, hist);
|
result = ggml_quantize_q3_K(src + start, block, n, n, hist);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q4_k * block = (block_q4_k*)dst + start / QK_K;
|
block_q4_K * block = (block_q4_K*)dst + start / QK_K;
|
||||||
result = ggml_quantize_q4_k(src + start, block, n, n, hist);
|
result = ggml_quantize_q4_K(src + start, block, n, n, hist);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q5_k * block = (block_q5_k*)dst + start / QK_K;
|
block_q5_K * block = (block_q5_K*)dst + start / QK_K;
|
||||||
result = ggml_quantize_q5_k(src + start, block, n, n, hist);
|
result = ggml_quantize_q5_K(src + start, block, n, n, hist);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q6_k * block = (block_q6_k*)dst + start / QK_K;
|
block_q6_K * block = (block_q6_K*)dst + start / QK_K;
|
||||||
result = ggml_quantize_q6_k(src + start, block, n, n, hist);
|
result = ggml_quantize_q6_K(src + start, block, n, n, hist);
|
||||||
} break;
|
} break;
|
||||||
|
#endif
|
||||||
default:
|
default:
|
||||||
assert(false);
|
assert(false);
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#include "ggml-quants-k.h"
|
#include "k_quants.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
@ -272,7 +272,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
|||||||
|
|
||||||
//========================- 2-bit (de)-quantization
|
//========================- 2-bit (de)-quantization
|
||||||
|
|
||||||
void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k) {
|
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -341,7 +341,7 @@ void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k) {
|
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -374,26 +374,26 @@ void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_q2_k(const float * restrict x, void * restrict vy, int k) {
|
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
quantize_row_q2_k_reference(x, vy, k);
|
quantize_row_q2_K_reference(x, vy, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_q2_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
||||||
(void)hist;
|
(void)hist;
|
||||||
|
|
||||||
for (int j = 0; j < nb; j += k) {
|
for (int j = 0; j < nb; j += k) {
|
||||||
block_q2_k * restrict y = (block_q2_k *)dst + j/QK_K;
|
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
||||||
quantize_row_q2_k_reference(src + j, y, k);
|
quantize_row_q2_K_reference(src + j, y, k);
|
||||||
}
|
}
|
||||||
return (n/QK_K*sizeof(block_q2_k));
|
return (n/QK_K*sizeof(block_q2_K));
|
||||||
}
|
}
|
||||||
|
|
||||||
//========================= 3-bit (de)-quantization
|
//========================= 3-bit (de)-quantization
|
||||||
|
|
||||||
void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k) {
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -469,7 +469,7 @@ void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k) {
|
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
assert(QK_K == 256);
|
assert(QK_K == 256);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
@ -520,26 +520,26 @@ void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_q3_k(const float * restrict x, void * restrict vy, int k) {
|
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
quantize_row_q3_k_reference(x, vy, k);
|
quantize_row_q3_K_reference(x, vy, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_q3_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
||||||
(void)hist;
|
(void)hist;
|
||||||
|
|
||||||
for (int j = 0; j < nb; j += k) {
|
for (int j = 0; j < nb; j += k) {
|
||||||
block_q3_k * restrict y = (block_q3_k *)dst + j/QK_K;
|
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
||||||
quantize_row_q3_k_reference(src + j, y, k);
|
quantize_row_q3_K_reference(src + j, y, k);
|
||||||
}
|
}
|
||||||
return (n/QK_K*sizeof(block_q3_k));
|
return (n/QK_K*sizeof(block_q3_K));
|
||||||
}
|
}
|
||||||
|
|
||||||
// ====================== 4-bit (de)-quantization
|
// ====================== 4-bit (de)-quantization
|
||||||
|
|
||||||
void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k) {
|
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -604,7 +604,7 @@ void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k) {
|
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -630,26 +630,26 @@ void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_q4_k(const float * restrict x, void * restrict vy, int k) {
|
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
block_q4_k * restrict y = vy;
|
block_q4_K * restrict y = vy;
|
||||||
quantize_row_q4_k_reference(x, y, k);
|
quantize_row_q4_K_reference(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_q4_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
(void)hist; // TODO: collect histograms
|
(void)hist; // TODO: collect histograms
|
||||||
for (int j = 0; j < nb; j += k) {
|
for (int j = 0; j < nb; j += k) {
|
||||||
block_q4_k * restrict y = (block_q4_k *)dst + j/QK_K;
|
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
||||||
quantize_row_q4_k_reference(src + j, y, k);
|
quantize_row_q4_K_reference(src + j, y, k);
|
||||||
}
|
}
|
||||||
return (n/QK_K*sizeof(block_q4_k));
|
return (n/QK_K*sizeof(block_q4_K));
|
||||||
}
|
}
|
||||||
|
|
||||||
// ====================== 5-bit (de)-quantization
|
// ====================== 5-bit (de)-quantization
|
||||||
|
|
||||||
void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k) {
|
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -731,7 +731,7 @@ void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k) {
|
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -759,26 +759,26 @@ void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_q5_k(const float * restrict x, void * restrict vy, int k) {
|
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
block_q5_k * restrict y = vy;
|
block_q5_K * restrict y = vy;
|
||||||
quantize_row_q5_k_reference(x, y, k);
|
quantize_row_q5_K_reference(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_q5_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
(void)hist;
|
(void)hist;
|
||||||
for (int j = 0; j < nb; j += k) {
|
for (int j = 0; j < nb; j += k) {
|
||||||
block_q5_k * restrict y = (block_q5_k *)dst + j/QK_K;
|
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
||||||
quantize_row_q5_k_reference(src + j, y, k);
|
quantize_row_q5_K_reference(src + j, y, k);
|
||||||
}
|
}
|
||||||
return (n/QK_K*sizeof(block_q5_k));
|
return (n/QK_K*sizeof(block_q5_K));
|
||||||
}
|
}
|
||||||
|
|
||||||
// ====================== 6-bit (de)-quantization
|
// ====================== 6-bit (de)-quantization
|
||||||
|
|
||||||
void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k) {
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -842,7 +842,7 @@ void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k) {
|
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -875,28 +875,28 @@ void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_q6_k(const float * restrict x, void * restrict vy, int k) {
|
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
block_q6_k * restrict y = vy;
|
block_q6_K * restrict y = vy;
|
||||||
quantize_row_q6_k_reference(x, y, k);
|
quantize_row_q6_K_reference(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist) {
|
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
(void)hist; // TODO
|
(void)hist; // TODO
|
||||||
|
|
||||||
for (int j = 0; j < nb; j += k) {
|
for (int j = 0; j < nb; j += k) {
|
||||||
block_q6_k * restrict y = (block_q6_k *)dst + j/QK_K;
|
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
||||||
quantize_row_q6_k_reference(src + j, y, k);
|
quantize_row_q6_K_reference(src + j, y, k);
|
||||||
}
|
}
|
||||||
return (n/QK_K*sizeof(block_q6_k));
|
return (n/QK_K*sizeof(block_q6_K));
|
||||||
}
|
}
|
||||||
|
|
||||||
//===================================== Q8_K ==============================================
|
//===================================== Q8_K ==============================================
|
||||||
|
|
||||||
void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k) {
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -933,7 +933,7 @@ void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k) {
|
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
@ -944,8 +944,8 @@ void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_q8_k(const float * restrict x, void * restrict y, int k) {
|
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
|
||||||
quantize_row_q8_k_reference(x, y, k);
|
quantize_row_q8_K_reference(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
//===================================== Dot ptoducts =================================
|
//===================================== Dot ptoducts =================================
|
||||||
@ -1002,10 +1002,10 @@ static inline __m128i get_scale_shuffle(int i) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||||
|
|
||||||
const block_q2_k * restrict x = vx;
|
const block_q2_K * restrict x = vx;
|
||||||
const block_q8_k * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
const int nb = n / QK_K;
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
@ -1201,14 +1201,14 @@ void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restri
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
||||||
const uint32_t kmask1 = 0x03030303;
|
const uint32_t kmask1 = 0x03030303;
|
||||||
const uint32_t kmask2 = 0x0f0f0f0f;
|
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
|
|
||||||
const block_q3_k * restrict x = vx;
|
const block_q3_K * restrict x = vx;
|
||||||
const block_q8_k * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
const int nb = n / QK_K;
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
@ -1501,11 +1501,11 @@ void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restri
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
||||||
const block_q4_k * restrict x = vx;
|
const block_q4_K * restrict x = vx;
|
||||||
const block_q8_k * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
const int nb = n / QK_K;
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
@ -1727,11 +1727,11 @@ void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restri
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
||||||
const block_q5_k * restrict x = vx;
|
const block_q5_K * restrict x = vx;
|
||||||
const block_q8_k * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
const int nb = n / QK_K;
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
@ -1974,11 +1974,11 @@ void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restri
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
void ggml_vec_dot_q6_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
||||||
const block_q6_k * restrict x = vx;
|
const block_q6_K * restrict x = vx;
|
||||||
const block_q8_k * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
const int nb = n / QK_K;
|
const int nb = n / QK_K;
|
||||||
|
|
@ -22,8 +22,8 @@ typedef struct {
|
|||||||
uint8_t qs[QK_K/4]; // quants
|
uint8_t qs[QK_K/4]; // quants
|
||||||
ggml_fp16_t d; // super-block scale for quantized scales
|
ggml_fp16_t d; // super-block scale for quantized scales
|
||||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||||
} block_q2_k;
|
} block_q2_K;
|
||||||
static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding");
|
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||||
|
|
||||||
// 3-bit quantization
|
// 3-bit quantization
|
||||||
// weight is represented as x = a * q
|
// weight is represented as x = a * q
|
||||||
@ -34,8 +34,8 @@ typedef struct {
|
|||||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||||
ggml_fp16_t d; // super-block scale
|
ggml_fp16_t d; // super-block scale
|
||||||
} block_q3_k;
|
} block_q3_K;
|
||||||
static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding");
|
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
|
||||||
|
|
||||||
// 4-bit quantization
|
// 4-bit quantization
|
||||||
// 16 blocks of 32 elements each
|
// 16 blocks of 32 elements each
|
||||||
@ -46,8 +46,8 @@ typedef struct {
|
|||||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||||
} block_q4_k;
|
} block_q4_K;
|
||||||
static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding");
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||||
|
|
||||||
// 5-bit quantization
|
// 5-bit quantization
|
||||||
// 16 blocks of 32 elements each
|
// 16 blocks of 32 elements each
|
||||||
@ -59,8 +59,8 @@ typedef struct {
|
|||||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||||
uint8_t qh[QK_K/8]; // quants, high bit
|
uint8_t qh[QK_K/8]; // quants, high bit
|
||||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||||
} block_q5_k;
|
} block_q5_K;
|
||||||
static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding");
|
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||||
|
|
||||||
// 6-bit quantization
|
// 6-bit quantization
|
||||||
// weight is represented as x = a * q
|
// weight is represented as x = a * q
|
||||||
@ -71,52 +71,52 @@ typedef struct {
|
|||||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||||
ggml_fp16_t d; // super-block scale
|
ggml_fp16_t d; // super-block scale
|
||||||
} block_q6_k;
|
} block_q6_K;
|
||||||
static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_k block size/padding");
|
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
|
||||||
|
|
||||||
// This is only used for intermediate quantization and dot products
|
// This is only used for intermediate quantization and dot products
|
||||||
typedef struct {
|
typedef struct {
|
||||||
float d; // delta
|
float d; // delta
|
||||||
int8_t qs[QK_K]; // quants
|
int8_t qs[QK_K]; // quants
|
||||||
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
|
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
|
||||||
} block_q8_k;
|
} block_q8_K;
|
||||||
static_assert(sizeof(block_q8_k) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_k block size/padding");
|
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
|
||||||
|
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k);
|
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
||||||
void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k);
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
||||||
void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k);
|
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
||||||
void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k);
|
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||||
void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k);
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||||
void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k);
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||||
|
|
||||||
void quantize_row_q2_k(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q3_k(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q4_k(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q5_k(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q6_k(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q8_k(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||||
|
|
||||||
// Dequantization
|
// Dequantization
|
||||||
void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k);
|
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
||||||
void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k);
|
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
||||||
void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k);
|
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
||||||
void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k);
|
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
||||||
void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k);
|
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
||||||
void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k);
|
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||||
|
|
||||||
// Dot product
|
// Dot product
|
||||||
void ggml_vec_dot_q2_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_q3_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_q4_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_q5_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_q6_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
|
|
||||||
// Quantization with histogram collection
|
// Quantization with histogram collection
|
||||||
size_t ggml_quantize_q2_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
size_t ggml_quantize_q3_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
size_t ggml_quantize_q4_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
size_t ggml_quantize_q5_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user