mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-04 07:44:35 +00:00
437 lines
14 KiB
C++
437 lines
14 KiB
C++
|
#include <ggml.h>
|
||
|
#include <ggml-alloc.h>
|
||
|
#include <ggml-backend.h>
|
||
|
#include <ggml-backend-impl.h>
|
||
|
|
||
|
#include <iostream>
|
||
|
#include <algorithm>
|
||
|
#include <array>
|
||
|
#include <cfloat>
|
||
|
#include <cstring>
|
||
|
#include <functional>
|
||
|
#include <memory>
|
||
|
#include <random>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string>
|
||
|
#include <thread>
|
||
|
#include <vector>
|
||
|
|
||
|
#include "helpers.hpp"
|
||
|
|
||
|
// Selects how a test_case runs its graph.
enum test_mode {
    MODE_TEST = 0, // set by test_case::eval: sentinel tensors are added
    MODE_PERF = 1, // set by test_case::eval_perf: add_sentinel() does nothing
};
|
||
|
|
||
|
struct test_case {
|
||
|
virtual ~test_case() {}
|
||
|
|
||
|
virtual std::string op_desc(ggml_tensor * t) {
|
||
|
return ggml_op_desc(t);
|
||
|
}
|
||
|
|
||
|
virtual std::string vars() {
|
||
|
return "";
|
||
|
}
|
||
|
|
||
|
virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
|
||
|
|
||
|
virtual double max_nmse_err() {
|
||
|
return 1e-7;
|
||
|
}
|
||
|
|
||
|
virtual void initialize_tensors(ggml_context * ctx) {
|
||
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||
|
init_tensor_uniform(t);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
virtual size_t op_size(ggml_tensor * t) {
|
||
|
size_t size = ggml_nbytes(t);
|
||
|
// add source tensors
|
||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||
|
if (t->src[i] != NULL) {
|
||
|
size += ggml_nbytes(t->src[i]);
|
||
|
}
|
||
|
}
|
||
|
return size;
|
||
|
}
|
||
|
|
||
|
ggml_cgraph * gf = nullptr;
|
||
|
|
||
|
static const int sentinel_size = 1024;
|
||
|
|
||
|
test_mode mode;
|
||
|
|
||
|
std::vector<ggml_tensor *> sentinels;
|
||
|
|
||
|
void add_sentinel(ggml_context * ctx) {
|
||
|
if (mode == MODE_PERF) {
|
||
|
return;
|
||
|
}
|
||
|
ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
|
||
|
ggml_format_name(sentinel, "sent_%zu", sentinels.size());
|
||
|
sentinels.push_back(sentinel);
|
||
|
}
|
||
|
|
||
|
// hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
|
||
|
|
||
|
ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
|
||
|
ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
|
||
|
add_sentinel(ctx);
|
||
|
return t;
|
||
|
}
|
||
|
|
||
|
ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
|
||
|
ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
|
||
|
add_sentinel(ctx);
|
||
|
return t;
|
||
|
}
|
||
|
|
||
|
ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
|
||
|
ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
|
||
|
add_sentinel(ctx);
|
||
|
return t;
|
||
|
}
|
||
|
|
||
|
ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||
|
ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
|
||
|
add_sentinel(ctx);
|
||
|
return t;
|
||
|
}
|
||
|
|
||
|
ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||
|
ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
|
||
|
add_sentinel(ctx);
|
||
|
return t;
|
||
|
}
|
||
|
|
||
|
bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
|
||
|
mode = MODE_TEST;
|
||
|
|
||
|
ggml_init_params params = {
|
||
|
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
||
|
/* .mem_base = */ NULL,
|
||
|
/* .no_alloc = */ true,
|
||
|
};
|
||
|
ggml_context * ctx = ggml_init(params);
|
||
|
|
||
|
gf = ggml_new_graph(ctx);
|
||
|
|
||
|
// pre-graph sentinel
|
||
|
add_sentinel(ctx);
|
||
|
|
||
|
ggml_tensor * out = build_graph(ctx);
|
||
|
|
||
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
||
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
||
|
ggml_free(ctx);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
||
|
fflush(stdout);
|
||
|
|
||
|
// check if the backends support the ops
|
||
|
bool supported = true;
|
||
|
for (ggml_backend_t backend : {backend1, backend2}) {
|
||
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||
|
if (!ggml_backend_supports_op(backend, t)) {
|
||
|
printf("not supported [%s] ", ggml_backend_name(backend));
|
||
|
supported = false;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if (!supported) {
|
||
|
printf("\n");
|
||
|
ggml_free(ctx);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// post-graph sentinel
|
||
|
add_sentinel(ctx);
|
||
|
|
||
|
// allocate
|
||
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
|
||
|
if (buf == NULL) {
|
||
|
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
|
||
|
ggml_free(ctx);
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// build graph
|
||
|
ggml_build_forward_expand(gf, out);
|
||
|
|
||
|
// add sentinels as graph nodes so that they are checked in the callback
|
||
|
for (ggml_tensor * sentinel : sentinels) {
|
||
|
gf->nodes[gf->n_nodes++] = sentinel;
|
||
|
}
|
||
|
|
||
|
// randomize tensors
|
||
|
initialize_tensors(ctx);
|
||
|
|
||
|
// compare
|
||
|
struct callback_userdata {
|
||
|
bool ok;
|
||
|
double max_err;
|
||
|
ggml_backend_t backend1;
|
||
|
ggml_backend_t backend2;
|
||
|
};
|
||
|
|
||
|
callback_userdata ud {
|
||
|
true,
|
||
|
max_nmse_err(),
|
||
|
backend1,
|
||
|
backend2
|
||
|
};
|
||
|
|
||
|
auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
|
||
|
callback_userdata * ud = (callback_userdata *) user_data;
|
||
|
const char * bn1 = ggml_backend_name(ud->backend1);
|
||
|
const char * bn2 = ggml_backend_name(ud->backend2);
|
||
|
|
||
|
if (t1->op == GGML_OP_NONE) {
|
||
|
// sentinels must be unchanged
|
||
|
std::vector<uint8_t> t1_data(ggml_nbytes(t1));
|
||
|
std::vector<uint8_t> t2_data(ggml_nbytes(t2));
|
||
|
ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
|
||
|
ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
|
||
|
|
||
|
if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
|
||
|
printf("sentinel mismatch: %s ", t1->name);
|
||
|
ud->ok = false;
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
std::vector<float> f1 = tensor_to_float(t1);
|
||
|
std::vector<float> f2 = tensor_to_float(t2);
|
||
|
|
||
|
for (size_t i = 0; i < f1.size(); i++) {
|
||
|
// check for nans
|
||
|
if (std::isnan(f1[i]) || std::isnan(f2[i])) {
|
||
|
printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
|
||
|
ud->ok = false;
|
||
|
return true;
|
||
|
}
|
||
|
// check for infs: both must be inf of the same sign, or both must be finite
|
||
|
if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
|
||
|
if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
|
||
|
if (std::signbit(f1[i]) != std::signbit(f2[i])) {
|
||
|
printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
|
||
|
ud->ok = false;
|
||
|
return true;
|
||
|
}
|
||
|
} else {
|
||
|
printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
|
||
|
ud->ok = false;
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
double err = nmse(f1.data(), f2.data(), f1.size());
|
||
|
if (err > ud->max_err) {
|
||
|
printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
|
||
|
//for (int i = 0; i < (int) f1.size(); i++) {
|
||
|
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
|
||
|
//}
|
||
|
//printf("\n");
|
||
|
//exit(1);
|
||
|
ud->ok = false;
|
||
|
}
|
||
|
return true;
|
||
|
|
||
|
GGML_UNUSED(index);
|
||
|
};
|
||
|
|
||
|
const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
|
||
|
|
||
|
if (!cmp_ok) {
|
||
|
printf("compare failed ");
|
||
|
}
|
||
|
|
||
|
ggml_backend_buffer_free(buf);
|
||
|
|
||
|
ggml_free(ctx);
|
||
|
|
||
|
if (ud.ok && cmp_ok) {
|
||
|
printf("\033[1;32mOK\033[0m\n");
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
printf("\033[1;31mFAIL\033[0m\n");
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool eval_perf(ggml_backend_t backend, const char * op_name, int n_runs) {
|
||
|
mode = MODE_PERF;
|
||
|
|
||
|
static const size_t graph_nodes = 8192;
|
||
|
|
||
|
ggml_init_params params = {
|
||
|
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
|
||
|
/* .mem_base = */ NULL,
|
||
|
/* .no_alloc = */ true,
|
||
|
};
|
||
|
ggml_context * ctx = ggml_init(params);
|
||
|
|
||
|
ggml_tensor * out = build_graph(ctx);
|
||
|
|
||
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
||
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
||
|
ggml_free(ctx);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
||
|
fflush(stdout);
|
||
|
|
||
|
// check if backends support op
|
||
|
if (!ggml_backend_supports_op(backend, out)) {
|
||
|
printf("not supported\n");
|
||
|
ggml_free(ctx);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// align while also leaving some margin for variations in parameters
|
||
|
int align = 20;
|
||
|
int last = (len + align - 1) / align * align;
|
||
|
if (last - len < 5) {
|
||
|
last += align;
|
||
|
}
|
||
|
last = std::max(last, 60);
|
||
|
printf("%*s", last - len, "");
|
||
|
|
||
|
// allocate
|
||
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||
|
if (buf == NULL) {
|
||
|
printf("failed to allocate tensors\n");
|
||
|
ggml_free(ctx);
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// randomize tensors
|
||
|
initialize_tensors(ctx);
|
||
|
|
||
|
// build graph
|
||
|
ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
|
||
|
ggml_build_forward_expand(gf, out);
|
||
|
|
||
|
// warmup run
|
||
|
ggml_backend_graph_compute(backend, gf);
|
||
|
|
||
|
// duplicate the op
|
||
|
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
|
||
|
//int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
|
||
|
for (int i = 1; i < n_runs; i++) {
|
||
|
gf->nodes[gf->n_nodes++] = out;
|
||
|
}
|
||
|
|
||
|
// calculate memory
|
||
|
size_t mem = n_runs * op_size(out);
|
||
|
auto tensor_op_size = [](ggml_tensor * t) {
|
||
|
size_t size = ggml_nbytes(t);
|
||
|
// add source tensors
|
||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||
|
if (t->src[i] != NULL) {
|
||
|
size += ggml_nbytes(t->src[i]);
|
||
|
}
|
||
|
}
|
||
|
return size;
|
||
|
};
|
||
|
for (int i = 0; i < gf->n_nodes; i++) {
|
||
|
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
|
||
|
continue;
|
||
|
}
|
||
|
mem += tensor_op_size(gf->nodes[i]);
|
||
|
}
|
||
|
|
||
|
// run
|
||
|
ggml_backend_synchronize(backend);
|
||
|
|
||
|
int64_t start_time = ggml_time_us();
|
||
|
ggml_backend_graph_compute(backend, gf);
|
||
|
ggml_backend_synchronize(backend);
|
||
|
int64_t end_time = ggml_time_us();
|
||
|
double time_us = end_time - start_time;
|
||
|
|
||
|
printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
|
||
|
n_runs,
|
||
|
time_us / n_runs,
|
||
|
op_size(out) / 1024,
|
||
|
mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
|
||
|
|
||
|
ggml_backend_buffer_free(buf);
|
||
|
|
||
|
ggml_free(ctx);
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
// GGML_OP_MUL_MAT
|
||
|
struct test_mul_mat : public test_case {
|
||
|
const ggml_type type_a;
|
||
|
const ggml_type type_b;
|
||
|
const int64_t m;
|
||
|
const int64_t n;
|
||
|
const int64_t k;
|
||
|
const std::array<int64_t, 2> bs; // dims 3 and 4
|
||
|
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
|
||
|
|
||
|
std::string vars() override {
|
||
|
return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
|
||
|
}
|
||
|
|
||
|
double max_nmse_err() override {
|
||
|
return 5e-4;
|
||
|
}
|
||
|
|
||
|
size_t op_size(ggml_tensor * t) override {
|
||
|
size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
|
||
|
size_t b = ggml_nbytes(t->src[1]) * m;
|
||
|
size_t c = ggml_nbytes(t);
|
||
|
return a + b + c;
|
||
|
|
||
|
GGML_UNUSED(t);
|
||
|
}
|
||
|
|
||
|
test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
||
|
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
||
|
std::array<int64_t, 2> bs = {10, 10},
|
||
|
std::array<int64_t, 2> nr = {2, 2})
|
||
|
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
|
||
|
|
||
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||
|
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
||
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
|
||
|
ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
|
||
|
ggml_tensor * out = ggml_mul_mat(ctx, a, b);
|
||
|
return out;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
// Runs the MUL_MAT performance test for one fixed configuration
// (Q4_0 x F16, m=512, n=256, k=1024, no batching) on `backend`.
void bench_mul_mat(ggml_backend_t backend) {
    const std::array<int64_t, 2> batch  = {1, 1};
    const std::array<int64_t, 2> repeat = {1, 1};
    const int n_runs = 8000;

    test_mul_mat test(GGML_TYPE_Q4_0, GGML_TYPE_F16, /*m=*/512, /*n=*/256, /*k=*/1024, batch, repeat);

    test.eval_perf(backend, "MUL_MAT", n_runs);
}
|
||
|
|
||
|
int main() {
|
||
|
// enumerate backends
|
||
|
std::cout << "num_backends:" << ggml_backend_reg_get_count() << std::endl;
|
||
|
int backend_id = 1;
|
||
|
|
||
|
{
|
||
|
ggml_backend_t backend = ggml_backend_reg_init_backend(backend_id, NULL);
|
||
|
std::cout << "Using backend:" << ggml_backend_name(backend) << std::endl;
|
||
|
|
||
|
bench_mul_mat(backend);
|
||
|
|
||
|
ggml_backend_free(backend);
|
||
|
}
|
||
|
}
|