#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "helpers.hpp" enum test_mode { MODE_TEST, MODE_PERF, }; struct test_case { virtual ~test_case() {} virtual std::string op_desc(ggml_tensor * t) { return ggml_op_desc(t); } virtual std::string vars() { return ""; } virtual ggml_tensor * build_graph(ggml_context * ctx) = 0; virtual double max_nmse_err() { return 1e-7; } virtual void initialize_tensors(ggml_context * ctx) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { init_tensor_uniform(t); } } virtual size_t op_size(ggml_tensor * t) { size_t size = ggml_nbytes(t); // add source tensors for (int i = 0; i < GGML_MAX_SRC; i++) { if (t->src[i] != NULL) { size += ggml_nbytes(t->src[i]); } } return size; } ggml_cgraph * gf = nullptr; static const int sentinel_size = 1024; test_mode mode; std::vector sentinels; void add_sentinel(ggml_context * ctx) { if (mode == MODE_PERF) { return; } ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size); ggml_format_name(sentinel, "sent_%zu", sentinels.size()); sentinels.push_back(sentinel); } // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) { ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne); add_sentinel(ctx); return t; } ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) { ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0); add_sentinel(ctx); return t; } ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) { ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1); add_sentinel(ctx); return t; } ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) { ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2); add_sentinel(ctx); return t; } ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3); add_sentinel(ctx); return t; } bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) { mode = MODE_TEST; ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), /* .mem_base = */ NULL, /* .no_alloc = */ true, }; ggml_context * ctx = ggml_init(params); gf = ggml_new_graph(ctx); // pre-graph sentinel add_sentinel(ctx); ggml_tensor * out = build_graph(ctx); if (op_name != nullptr && op_desc(out) != op_name) { //printf(" %s: skipping\n", op_desc(out).c_str()); ggml_free(ctx); return true; } printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); fflush(stdout); // check if the backends support the ops bool supported = true; for (ggml_backend_t backend : {backend1, backend2}) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (!ggml_backend_supports_op(backend, t)) { printf("not supported [%s] ", ggml_backend_name(backend)); supported = false; break; } } } if (!supported) { printf("\n"); ggml_free(ctx); return true; } // post-graph sentinel add_sentinel(ctx); // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); if (buf == NULL) { printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1)); ggml_free(ctx); return false; } // build graph ggml_build_forward_expand(gf, out); // add sentinels as graph nodes so that they are checked in the callback for (ggml_tensor * sentinel : sentinels) { gf->nodes[gf->n_nodes++] = sentinel; } // randomize tensors initialize_tensors(ctx); // compare struct callback_userdata { bool ok; double max_err; ggml_backend_t backend1; ggml_backend_t backend2; }; callback_userdata ud { true, max_nmse_err(), backend1, backend2 }; auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool { callback_userdata * ud = (callback_userdata *) user_data; const char * bn1 = ggml_backend_name(ud->backend1); const char * bn2 = ggml_backend_name(ud->backend2); if (t1->op == GGML_OP_NONE) { // sentinels must be unchanged std::vector t1_data(ggml_nbytes(t1)); std::vector t2_data(ggml_nbytes(t2)); ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1)); ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2)); if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) { printf("sentinel mismatch: %s ", t1->name); ud->ok = false; return true; } } std::vector f1 = tensor_to_float(t1); std::vector f2 = tensor_to_float(t2); for (size_t i = 0; i < f1.size(); i++) { // check for nans if (std::isnan(f1[i]) || std::isnan(f2[i])) { printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]); ud->ok = false; return true; } // check for infs: both must be inf of the same sign, or both must be finite if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) { if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) { if (std::signbit(f1[i]) != std::signbit(f2[i])) { printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]); ud->ok = false; return true; } } else { printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]); ud->ok = false; return true; } } } double err = nmse(f1.data(), f2.data(), f1.size()); if (err > ud->max_err) { printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err); //for (int i = 0; i < (int) f1.size(); i++) { // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]); //} //printf("\n"); //exit(1); ud->ok = false; } return true; GGML_UNUSED(index); }; const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); if (!cmp_ok) { printf("compare failed "); } ggml_backend_buffer_free(buf); ggml_free(ctx); if (ud.ok && cmp_ok) { printf("\033[1;32mOK\033[0m\n"); return true; } printf("\033[1;31mFAIL\033[0m\n"); return false; } bool eval_perf(ggml_backend_t backend, const char * op_name, int n_runs) { mode = MODE_PERF; static const size_t graph_nodes = 8192; ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false), /* .mem_base = */ NULL, /* .no_alloc = */ true, }; ggml_context * ctx = ggml_init(params); ggml_tensor * out = build_graph(ctx); if (op_name != nullptr && op_desc(out) != op_name) { //printf(" %s: skipping\n", op_desc(out).c_str()); ggml_free(ctx); return true; } int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); fflush(stdout); // check if backends support op if (!ggml_backend_supports_op(backend, out)) { printf("not supported\n"); ggml_free(ctx); return true; } // align while also leaving some margin for variations in parameters int align = 20; int last = (len + align - 1) / align * align; if (last - len < 5) { last += align; } last = std::max(last, 60); printf("%*s", last - len, ""); // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); if (buf == NULL) { printf("failed to allocate tensors\n"); ggml_free(ctx); return false; } // randomize tensors initialize_tensors(ctx); // build graph ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false); ggml_build_forward_expand(gf, out); // warmup run ggml_backend_graph_compute(backend, gf); // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU //int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; for (int i = 1; i < n_runs; i++) { gf->nodes[gf->n_nodes++] = out; } // calculate memory size_t mem = n_runs * op_size(out); auto tensor_op_size = [](ggml_tensor * t) { size_t size = ggml_nbytes(t); // add source tensors for (int i = 0; i < GGML_MAX_SRC; i++) { if (t->src[i] != NULL) { size += ggml_nbytes(t->src[i]); } } return size; }; for (int i = 0; i < gf->n_nodes; i++) { if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) { continue; } mem += tensor_op_size(gf->nodes[i]); } // run ggml_backend_synchronize(backend); int64_t start_time = ggml_time_us(); ggml_backend_graph_compute(backend, gf); ggml_backend_synchronize(backend); int64_t end_time = ggml_time_us(); double time_us = end_time - start_time; printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n", n_runs, time_us / n_runs, op_size(out) / 1024, mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0); ggml_backend_buffer_free(buf); ggml_free(ctx); return true; } }; // GGML_OP_MUL_MAT struct test_mul_mat : public test_case { const ggml_type type_a; const ggml_type type_b; const int64_t m; const int64_t n; const int64_t k; const std::array bs; // dims 3 and 4 const std::array nr; // repeat in dims 3 and 4 std::string vars() override { return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr); } double max_nmse_err() override { return 5e-4; } size_t op_size(ggml_tensor * t) override { size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1]; size_t b = ggml_nbytes(t->src[1]) * m; size_t c = ggml_nbytes(t); return a + b + c; GGML_UNUSED(t); } test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, int64_t m = 32, int64_t n = 32, int64_t k = 32, std::array bs = {10, 10}, std::array nr = {2, 2}) : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {} ggml_tensor * build_graph(ggml_context * ctx) override { // C^T = A * B^T: (k, m) * (k, n) => (m, n) ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]); ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); ggml_tensor * out = ggml_mul_mat(ctx, a, b); return out; } }; void bench_mul_mat(ggml_backend_t backend) { auto test = test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F16, 512, 256, 1024, { 1, 1}, {1, 1}); test.eval_perf(backend, "MUL_MAT", 8000 /*n_runs*/); } int main() { // enumerate backends std::cout << "num_backends:" << ggml_backend_reg_get_count() << std::endl; int backend_id = 1; { ggml_backend_t backend = ggml_backend_reg_init_backend(backend_id, NULL); std::cout << "Using backend:" << ggml_backend_name(backend) << std::endl; bench_mul_mat(backend); ggml_backend_free(backend); } }