mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-14 14:59:52 +00:00
ggml : fix BLAS with unsupported types (#9775)
Some checks failed
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled
Some checks failed
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled
* ggml : do not use BLAS with types without to_float
* ggml : return pointer from ggml_internal_get_type_traits to avoid unnecessary copies
* ggml : rename ggml_internal_get_type_traits -> ggml_get_type_traits

  it's not really internal if everybody uses it
This commit is contained in:
parent 458367a906
commit dca1d4b58a
@ -314,9 +314,9 @@ struct lora_merge_ctx {
|
|||||||
// optionally dequantize it
|
// optionally dequantize it
|
||||||
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
|
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
|
||||||
auto nels = ggml_nelements(inp_base);
|
auto nels = ggml_nelements(inp_base);
|
||||||
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
|
const auto * qtype = ggml_get_type_traits(base->type);
|
||||||
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
|
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
|
||||||
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
|
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
|
||||||
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
|
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
|
||||||
} else {
|
} else {
|
||||||
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
|
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
|
||||||
|
@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void test_roundtrip_on_chunk(
|
static void test_roundtrip_on_chunk(
|
||||||
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
|
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
|
||||||
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
||||||
) {
|
) {
|
||||||
if (layer->type == GGML_TYPE_F16) {
|
if (layer->type == GGML_TYPE_F16) {
|
||||||
@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
|
|||||||
|
|
||||||
// Run quantization function for a single layer and update error stats
|
// Run quantization function for a single layer and update error stats
|
||||||
static void test_roundtrip_on_layer(
|
static void test_roundtrip_on_layer(
|
||||||
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
|
std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
|
||||||
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
||||||
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
||||||
) {
|
) {
|
||||||
@ -371,8 +371,8 @@ int main(int argc, char ** argv) {
|
|||||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
const auto * qfns = ggml_get_type_traits(type);
|
||||||
if (qfns.from_float && qfns.to_float) {
|
if (qfns->from_float && qfns->to_float) {
|
||||||
if (params.verbose) {
|
if (params.verbose) {
|
||||||
printf("testing %s ...\n", ggml_type_name(type));
|
printf("testing %s ...\n", ggml_type_name(type));
|
||||||
}
|
}
|
||||||
@ -393,7 +393,7 @@ int main(int argc, char ** argv) {
|
|||||||
test_roundtrip_on_layer(
|
test_roundtrip_on_layer(
|
||||||
layer_name,
|
layer_name,
|
||||||
params.per_layer_stats,
|
params.per_layer_stats,
|
||||||
qfns,
|
*qfns,
|
||||||
params.reference,
|
params.reference,
|
||||||
kv_tensor.second,
|
kv_tensor.second,
|
||||||
input_scratch,
|
input_scratch,
|
||||||
|
@ -2535,7 +2535,7 @@ extern "C" {
|
|||||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||||
const void * GGML_RESTRICT y, int nr, int nc);
|
const void * GGML_RESTRICT y, int nr, int nc);
|
||||||
|
|
||||||
typedef struct {
|
struct ggml_type_traits {
|
||||||
const char * type_name;
|
const char * type_name;
|
||||||
int64_t blck_size;
|
int64_t blck_size;
|
||||||
int64_t blck_size_interleave; // interleave elements in blocks
|
int64_t blck_size_interleave; // interleave elements in blocks
|
||||||
@ -2551,9 +2551,9 @@ extern "C" {
|
|||||||
int64_t ncols; // number of columns to process simultaneously
|
int64_t ncols; // number of columns to process simultaneously
|
||||||
ggml_gemv_t gemv;
|
ggml_gemv_t gemv;
|
||||||
ggml_gemm_t gemm;
|
ggml_gemm_t gemm;
|
||||||
} ggml_type_traits_t;
|
};
|
||||||
|
|
||||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -1177,7 +1177,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
|
|||||||
op->type != GGML_TYPE_IQ1_S &&
|
op->type != GGML_TYPE_IQ1_S &&
|
||||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
|
||||||
case GGML_OP_ROPE_BACK:
|
case GGML_OP_ROPE_BACK:
|
||||||
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
||||||
case GGML_OP_IM2COL_BACK:
|
case GGML_OP_IM2COL_BACK:
|
||||||
|
@ -65,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
|
|||||||
|
|
||||||
// convert src0 to float
|
// convert src0 to float
|
||||||
if (type != GGML_TYPE_F32) {
|
if (type != GGML_TYPE_F32) {
|
||||||
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
|
const auto * type_traits = ggml_get_type_traits(type);
|
||||||
ggml_to_float_t const to_float = type_traits.to_float;
|
ggml_to_float_t const to_float = type_traits->to_float;
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
@ -420,19 +420,21 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s
|
|||||||
// TODO: find the optimal value
|
// TODO: find the optimal value
|
||||||
const int64_t min_batch = 32;
|
const int64_t min_batch = 32;
|
||||||
|
|
||||||
return (ggml_is_contiguous(src0) &&
|
return ggml_is_contiguous(src0) &&
|
||||||
ggml_is_contiguous(src1) &&
|
ggml_is_contiguous(src1) &&
|
||||||
src1->type == GGML_TYPE_F32 &&
|
src1->type == GGML_TYPE_F32 &&
|
||||||
(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch));
|
(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
|
||||||
|
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
case GGML_OP_OUT_PROD:
|
case GGML_OP_OUT_PROD:
|
||||||
return (op->src[0]->type == GGML_TYPE_F32 &&
|
return op->src[0]->type == GGML_TYPE_F32 &&
|
||||||
op->src[1]->type == GGML_TYPE_F32 &&
|
op->src[1]->type == GGML_TYPE_F32 &&
|
||||||
ggml_is_matrix(src0) &&
|
ggml_is_matrix(src0) &&
|
||||||
ggml_is_matrix(src1) &&
|
ggml_is_matrix(src1) &&
|
||||||
ggml_is_contiguous(src0) &&
|
ggml_is_contiguous(src0) &&
|
||||||
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
|
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
|
||||||
|
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
|
@ -5287,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
|
const auto * tt = ggml_get_type_traits(quant);
|
||||||
|
|
||||||
ggml_to_float_t dequant_fn = tt.to_float;
|
ggml_to_float_t dequant_fn = tt->to_float;
|
||||||
|
|
||||||
dequant_fn(from, to, ne);
|
dequant_fn(from, to, ne);
|
||||||
}
|
}
|
||||||
|
@ -729,7 +729,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
|
|||||||
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
||||||
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
|
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
|
||||||
|
|
||||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I8] = {
|
[GGML_TYPE_I8] = {
|
||||||
.type_name = "i8",
|
.type_name = "i8",
|
||||||
.blck_size = 1,
|
.blck_size = 1,
|
||||||
@ -1151,9 +1151,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// For internal test use
|
// For internal test use
|
||||||
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
|
||||||
GGML_ASSERT(type < GGML_TYPE_COUNT);
|
GGML_ASSERT(type < GGML_TYPE_COUNT);
|
||||||
return type_traits[type];
|
return &type_traits[type];
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -136,7 +136,7 @@ int main(int argc, char** argv) {
|
|||||||
|
|
||||||
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
|
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
|
||||||
|
|
||||||
auto funcs = ggml_internal_get_type_traits(ggml_type);
|
const auto * funcs = ggml_get_type_traits(ggml_type);
|
||||||
|
|
||||||
Stat simple, ggml;
|
Stat simple, ggml;
|
||||||
|
|
||||||
@ -156,8 +156,8 @@ int main(int argc, char** argv) {
|
|||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
float fs;
|
float fs;
|
||||||
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
|
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
|
||||||
else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
|
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||||
if (iloop > 3) ggml.addResult(fs, t);
|
if (iloop > 3) ggml.addResult(fs, t);
|
||||||
|
@ -236,7 +236,7 @@ int main(int argc, char** argv) {
|
|||||||
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
||||||
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
||||||
|
|
||||||
auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
|
const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);
|
||||||
|
|
||||||
std::vector<block_q4_0> q40;
|
std::vector<block_q4_0> q40;
|
||||||
std::vector<block_q4_1> q41;
|
std::vector<block_q4_1> q41;
|
||||||
@ -261,9 +261,9 @@ int main(int argc, char** argv) {
|
|||||||
// Note, we do not include this in the timing as in practical application
|
// Note, we do not include this in the timing as in practical application
|
||||||
// we already have the quantized model weights.
|
// we already have the quantized model weights.
|
||||||
if (useQ4_1) {
|
if (useQ4_1) {
|
||||||
funcs.from_float(x1.data(), q41.data(), kVecSize);
|
funcs->from_float(x1.data(), q41.data(), kVecSize);
|
||||||
} else {
|
} else {
|
||||||
funcs.from_float(x1.data(), q40.data(), kVecSize);
|
funcs->from_float(x1.data(), q40.data(), kVecSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now measure time the dot product needs using the "scalar" version above
|
// Now measure time the dot product needs using the "scalar" version above
|
||||||
@ -282,10 +282,10 @@ int main(int argc, char** argv) {
|
|||||||
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
|
const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
|
||||||
vdot.from_float(y1.data(), q8.data(), kVecSize);
|
vdot->from_float(y1.data(), q8.data(), kVecSize);
|
||||||
if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||||
else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||||
}
|
}
|
||||||
sumq += result;
|
sumq += result;
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
@ -17872,10 +17872,9 @@ static void llama_tensor_dequantize_internal(
|
|||||||
}
|
}
|
||||||
float * f32_output = (float *) output.data();
|
float * f32_output = (float *) output.data();
|
||||||
|
|
||||||
ggml_type_traits_t qtype;
|
const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
|
||||||
if (ggml_is_quantized(tensor->type)) {
|
if (ggml_is_quantized(tensor->type)) {
|
||||||
qtype = ggml_internal_get_type_traits(tensor->type);
|
if (qtype->to_float == NULL) {
|
||||||
if (qtype.to_float == NULL) {
|
|
||||||
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
||||||
}
|
}
|
||||||
} else if (tensor->type != GGML_TYPE_F16 &&
|
} else if (tensor->type != GGML_TYPE_F16 &&
|
||||||
@ -17889,7 +17888,7 @@ static void llama_tensor_dequantize_internal(
|
|||||||
} else if (tensor->type == GGML_TYPE_BF16) {
|
} else if (tensor->type == GGML_TYPE_BF16) {
|
||||||
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
||||||
} else if (ggml_is_quantized(tensor->type)) {
|
} else if (ggml_is_quantized(tensor->type)) {
|
||||||
qtype.to_float(tensor->data, f32_output, nelements);
|
qtype->to_float(tensor->data, f32_output, nelements);
|
||||||
} else {
|
} else {
|
||||||
GGML_ABORT("fatal error"); // unreachable
|
GGML_ABORT("fatal error"); // unreachable
|
||||||
}
|
}
|
||||||
@ -17925,7 +17924,7 @@ static void llama_tensor_dequantize_internal(
|
|||||||
} else if (typ == GGML_TYPE_BF16) {
|
} else if (typ == GGML_TYPE_BF16) {
|
||||||
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
||||||
} else {
|
} else {
|
||||||
qtype.to_float(inbuf, outbuf, nels);
|
qtype->to_float(inbuf, outbuf, nels);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
|
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
|
||||||
|
@ -133,7 +133,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|||||||
std::vector<uint8_t> buf(ggml_nbytes(t));
|
std::vector<uint8_t> buf(ggml_nbytes(t));
|
||||||
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
|
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
|
||||||
|
|
||||||
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
|
const auto * tt = ggml_get_type_traits(t->type);
|
||||||
size_t bs = ggml_blck_size(t->type);
|
size_t bs = ggml_blck_size(t->type);
|
||||||
std::vector<float> vq(ggml_blck_size(t->type));
|
std::vector<float> vq(ggml_blck_size(t->type));
|
||||||
bool quantized = ggml_is_quantized(t->type);
|
bool quantized = ggml_is_quantized(t->type);
|
||||||
@ -159,7 +159,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|||||||
} else if (t->type == GGML_TYPE_I8) {
|
} else if (t->type == GGML_TYPE_I8) {
|
||||||
tv.push_back((float)*(int8_t *) &buf[i]);
|
tv.push_back((float)*(int8_t *) &buf[i]);
|
||||||
} else if (quantized) {
|
} else if (quantized) {
|
||||||
tt.to_float(&buf[i], vq.data(), bs);
|
tt->to_float(&buf[i], vq.data(), bs);
|
||||||
tv.insert(tv.end(), vq.begin(), vq.end());
|
tv.insert(tv.end(), vq.begin(), vq.end());
|
||||||
} else {
|
} else {
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
|
@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Total quantization error on test data
|
// Total quantization error on test data
|
||||||
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
|
||||||
std::vector<uint8_t> tmp_q(2*test_size);
|
std::vector<uint8_t> tmp_q(2*test_size);
|
||||||
std::vector<float> tmp_out(test_size);
|
std::vector<float> tmp_out(test_size);
|
||||||
|
|
||||||
qfns.from_float(test_data, tmp_q.data(), test_size);
|
qfns->from_float(test_data, tmp_q.data(), test_size);
|
||||||
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||||
return array_rmse(test_data, tmp_out.data(), test_size);
|
return array_rmse(test_data, tmp_out.data(), test_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Total quantization error on test data
|
// Total quantization error on test data
|
||||||
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
|
||||||
std::vector<uint8_t> tmp_q(2*test_size);
|
std::vector<uint8_t> tmp_q(2*test_size);
|
||||||
std::vector<float> tmp_out(test_size);
|
std::vector<float> tmp_out(test_size);
|
||||||
std::vector<float> tmp_out_ref(test_size);
|
std::vector<float> tmp_out_ref(test_size);
|
||||||
|
|
||||||
qfns.from_float(test_data, tmp_q.data(), test_size);
|
qfns->from_float(test_data, tmp_q.data(), test_size);
|
||||||
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||||
|
|
||||||
qfns.from_float_ref(test_data, tmp_q.data(), test_size);
|
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
|
||||||
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
||||||
|
|
||||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||||
}
|
}
|
||||||
@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
|||||||
|
|
||||||
// Total dot product error
|
// Total dot product error
|
||||||
static float dot_product_error(
|
static float dot_product_error(
|
||||||
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
|
const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
|
||||||
) {
|
) {
|
||||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||||
|
|
||||||
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
|
||||||
|
|
||||||
qfns.from_float(test_data1, tmp_q1.data(), test_size);
|
qfns->from_float(test_data1, tmp_q1.data(), test_size);
|
||||||
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
vdot->from_float(test_data2, tmp_q2.data(), test_size);
|
||||||
|
|
||||||
float result = INFINITY;
|
float result = INFINITY;
|
||||||
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||||
|
|
||||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||||
|
|
||||||
@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {
|
|||||||
|
|
||||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
ggml_type type = (ggml_type) i;
|
ggml_type type = (ggml_type) i;
|
||||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
const auto * qfns = ggml_get_type_traits(type);
|
||||||
|
|
||||||
// deprecated - skip
|
// deprecated - skip
|
||||||
if (qfns.blck_size == 0) {
|
if (qfns->blck_size == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
|
|||||||
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
||||||
ggml_quantize_init(ei);
|
ggml_quantize_init(ei);
|
||||||
|
|
||||||
if (qfns.from_float && qfns.to_float) {
|
if (qfns->from_float && qfns->to_float) {
|
||||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||||
const float max_quantization_error =
|
const float max_quantization_error =
|
||||||
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
||||||
|
@ -122,9 +122,9 @@ static void usage(char * argv[]) {
|
|||||||
printf(" --type TYPE set test type as");
|
printf(" --type TYPE set test type as");
|
||||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
ggml_type type = (ggml_type) i;
|
ggml_type type = (ggml_type) i;
|
||||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
const auto * qfns = ggml_get_type_traits(type);
|
||||||
if (ggml_type_name(type) != NULL) {
|
if (ggml_type_name(type) != NULL) {
|
||||||
if (qfns.from_float && qfns.to_float) {
|
if (qfns->from_float && qfns->to_float) {
|
||||||
printf(" %s", ggml_type_name(type));
|
printf(" %s", ggml_type_name(type));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -270,12 +270,12 @@ int main(int argc, char * argv[]) {
|
|||||||
|
|
||||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
ggml_type type = (ggml_type) i;
|
ggml_type type = (ggml_type) i;
|
||||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
const auto * qfns = ggml_get_type_traits(type);
|
||||||
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (qfns.from_float && qfns.to_float) {
|
if (qfns->from_float && qfns->to_float) {
|
||||||
printf("%s\n", ggml_type_name(type));
|
printf("%s\n", ggml_type_name(type));
|
||||||
|
|
||||||
ggml_quantize_init(type);
|
ggml_quantize_init(type);
|
||||||
@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
|
|||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void) -> float {
|
auto quantize_fn = [&](void) -> float {
|
||||||
qfns.from_float_ref(test_data1, test_q1, size);
|
qfns->from_float_ref(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
};
|
};
|
||||||
size_t quantized_size = ggml_row_size(type, size);
|
size_t quantized_size = ggml_row_size(type, size);
|
||||||
@ -299,7 +299,7 @@ int main(int argc, char * argv[]) {
|
|||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void) -> float {
|
auto quantize_fn = [&](void) -> float {
|
||||||
qfns.from_float(test_data1, test_q1, size);
|
qfns->from_float(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
};
|
};
|
||||||
size_t quantized_size = ggml_row_size(type, size);
|
size_t quantized_size = ggml_row_size(type, size);
|
||||||
@ -310,11 +310,11 @@ int main(int argc, char * argv[]) {
|
|||||||
|
|
||||||
if (params.op_dequantize_row_q) {
|
if (params.op_dequantize_row_q) {
|
||||||
printf(" dequantize_row_q\n");
|
printf(" dequantize_row_q\n");
|
||||||
qfns.from_float(test_data1, test_q1, largest);
|
qfns->from_float(test_data1, test_q1, largest);
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void) -> float {
|
auto quantize_fn = [&](void) -> float {
|
||||||
qfns.to_float(test_q1, test_out, size);
|
qfns->to_float(test_q1, test_out, size);
|
||||||
return test_out[0];
|
return test_out[0];
|
||||||
};
|
};
|
||||||
size_t quantized_size = ggml_row_size(type, size);
|
size_t quantized_size = ggml_row_size(type, size);
|
||||||
@ -328,8 +328,8 @@ int main(int argc, char * argv[]) {
|
|||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void) -> float {
|
auto quantize_fn = [&](void) -> float {
|
||||||
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
|
||||||
vdot.from_float(test_data1, test_q1, size);
|
vdot->from_float(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
};
|
};
|
||||||
size_t quantized_size = ggml_row_size(type, size);
|
size_t quantized_size = ggml_row_size(type, size);
|
||||||
@ -340,13 +340,13 @@ int main(int argc, char * argv[]) {
|
|||||||
|
|
||||||
if (params.op_vec_dot_q) {
|
if (params.op_vec_dot_q) {
|
||||||
printf(" vec_dot_q\n");
|
printf(" vec_dot_q\n");
|
||||||
qfns.from_float(test_data1, test_q1, largest);
|
qfns->from_float(test_data1, test_q1, largest);
|
||||||
qfns.from_float(test_data2, test_q2, largest);
|
qfns->from_float(test_data2, test_q2, largest);
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void) -> float {
|
auto quantize_fn = [&](void) -> float {
|
||||||
float result;
|
float result;
|
||||||
qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
size_t quantized_size = ggml_row_size(type, size);
|
size_t quantized_size = ggml_row_size(type, size);
|
||||||
|
Loading…
Reference in New Issue
Block a user