diff --git a/ggml-tune.c b/ggml-tune.c
index 0a52443e4..81b012766 100644
--- a/ggml-tune.c
+++ b/ggml-tune.c
@@ -103,10 +103,9 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
                 names[i] = ggml_mulmat_tune_task_backend_name(
                     prof->stages[i].backend);
             }
-            printf(
-                "\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
-                "fastest profile: %s %s %s\n",
-                M, N, K, names[0], names[1], names[2]);
+            printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
+                   "fastest profile: %s %s %s\n",
+                   M, N, K, names[0], names[1], names[2]);
 #endif
         }
     }
@@ -707,8 +706,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
     void *buf = malloc(sz);
 
     if (!buf) {
-        fprintf(stderr,
-                "[tune] error: failed to allocate %zu MiB memory",
+        fprintf(stderr, "[tune] error: failed to allocate %zu MiB memory",
                 sz / 1024 / 1024);
         return 0;
     }
@@ -835,8 +833,9 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                     stages_time[j] = 0;
                 }
 
-                /*enum ggml_compute_error err = */
-                ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
+                enum ggml_compute_error err = ggml_threading_compute_tensor(
+                    thrd_ctx, node, wdata, wsize);
+                GGML_ASSERT(err == GGML_COMPUTE_OK);
 
                 for (int i = 0; i < 3; i++) {
                     int v = (int)stages_time[i];
@@ -892,11 +891,10 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
             fprintf(stdout, "[tune] data was written to `%s`\n",
                     params->fname);
         } else {
-            fprintf(
-                stderr,
-                "[tune] warn: failed to write file `%s`, print to "
-                "console instead\n\n",
-                params->fname);
+            fprintf(stderr,
+                    "[tune] warn: failed to write file `%s`, print to "
+                    "console instead\n\n",
+                    params->fname);
             params->output_console = 1;
         }
     }
diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c
index 913d25ff5..a8a204862 100644
--- a/tests/test-ggml-tune.c
+++ b/tests/test-ggml-tune.c
@@ -8,12 +8,13 @@
 static int bench(void);
 static int estimate_time_non_zero_NK(void);
 
-static void init_params(struct ggml_mulmat_tune_params *params, int m_num) {
+static void init_params(struct ggml_mulmat_tune_params *params,
+                        enum ggml_ftype ftype, int m_num, int n_threads) {
     *params = (struct ggml_mulmat_tune_params){
         .model =
             (struct ggml_mulmat_tune_model){
-                .name = "3B", // fake
-                .ftype = GGML_FTYPE_MOSTLY_Q4_0,
+                .name = "xB", // fake model name
+                .ftype = ftype,
                 .n_vocab = 4096,
                 .n_embd = 1024,
                 .n_ff = 2048,
@@ -21,7 +22,7 @@ static void init_params(struct ggml_mulmat_tune_params *params, int m_num) {
             },
         .m_num = m_num,
         .n_pass = 1,
-        .n_threads = 1,
+        .n_threads = n_threads,
         .progress = false,
         .output_console = true,
         .fname = NULL};
@@ -45,13 +46,11 @@ int main(void) {
 }
 
 static int bench(void) {
-    printf("test: %s\n", __func__);
-
     {
         enum ggml_task_backend backends[16];
         int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
         if (n_backends < 2) {
-            printf("test: %s, skipped because no BLAS\n", __func__);
+            printf("[test-ggml-tune] skipped because no BLAS\n");
             return 0;
         }
     }
@@ -67,16 +66,48 @@ static int bench(void) {
         ggml_free(ctx);
     }
 
-    struct ggml_mulmat_tune tune;
+    // F32: ggml_opencl: ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02,
+    // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838
+    enum ggml_ftype ftypes[] = {
+        // GGML_FTYPE_ALL_F32,
+        GGML_FTYPE_MOSTLY_F16,
+        GGML_FTYPE_MOSTLY_Q4_0,
+    };
 
-    struct ggml_mulmat_tune_params params;
+    int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]);
 
-    init_params(&params, /*m_num*/ 4);
+    const int m_num = 4;
 
-    bool ok = ggml_mulmat_tune_bench(&tune, &params);
-    ggml_mulmat_tune_free(&tune);
+    // Don't use n_threads larger than 2 because GitHub build hosts have
+    // limited resource quotas.
+    int threads_arr[] = {1, 2};
+    int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
 
-    return ok ? 0 : 1;
+    int n_passed = 0;
+    int n_tests = 0;
+
+    for (int i = 0; i < n_ftypes; i++) {
+        for (int j = 0; j < thread_arr_len; j++) {
+            printf("\n");
+
+            int n_threads = threads_arr[j];
+            struct ggml_mulmat_tune tune;
+
+            struct ggml_mulmat_tune_params params;
+            memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
+            init_params(&params, ftypes[i], m_num, n_threads);
+
+            ++n_tests;
+            bool ok = ggml_mulmat_tune_bench(&tune, &params);
+            if (ok) {
+                ++n_passed;
+            }
+            ggml_mulmat_tune_free(&tune);
+        }
+    }
+
+    printf("[test-ggml-tune] %d / %d passed\n", n_passed, n_tests);
+    return (n_passed == n_tests) ? 0 : 1;
 }
 
 // implement `ggml_task_profiles_provider`
@@ -93,7 +124,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
 }
 
 int estimate_time_non_zero_NK(void) {
-    printf("test: %s\n", __func__);
+    printf("test-ggml-tune: %s\n", __func__);
 
     struct test_data_t {
         int M;
@@ -106,9 +137,10 @@ int estimate_time_non_zero_NK(void) {
     };
 
     const int m_num = 2;
+    const int n_threads = 1; // unused in this test.
 
     struct ggml_mulmat_tune_params params;
-    init_params(&params, m_num);
+    init_params(&params, tune.ftype, m_num, n_threads);
 
     ggml_mulmat_tune_init(&tune, &params,
                           ggml_task_profiles_mock_qxx_provider);
@@ -123,8 +155,8 @@
     GGML_ASSERT(shape->n_profiles == 2);
     GGML_ASSERT(ggml_is_quantized(shape->src0_type));
 
-    printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K,
-           shape->n_profiles);
+    printf("[test-ggml-tune] %s, shape: N: %d, K: %d, n_profiles: %d\n",
+           __func__, shape->N, shape->K, shape->n_profiles);
 
     {
         shape->items[0] =
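For readers skimming the test changes: the reworked bench() above replaces a single hard-coded run with a small test matrix over ftypes and thread counts, tallying passes and failing the whole test if any combination fails. Below is a self-contained sketch of that pass/fail matrix pattern, not part of the patch: run_one() is a hypothetical stand-in for ggml_mulmat_tune_bench(), and the integer ftype values are placeholders rather than real GGML_FTYPE_* constants.

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for ggml_mulmat_tune_bench(): any function
     * returning true on success fits this pattern. */
    static bool run_one(int ftype, int n_threads) {
        (void)ftype;
        (void)n_threads;
        return true;
    }

    int main(void) {
        int ftypes[] = {1, 2};      /* placeholders for GGML_FTYPE_* values */
        int threads_arr[] = {1, 2}; /* kept small for CI resource quotas */

        int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]);
        int n_threads_arr = sizeof(threads_arr) / sizeof(threads_arr[0]);

        int n_passed = 0;
        int n_tests = 0;

        /* Run every (ftype, n_threads) combination, counting passes. */
        for (int i = 0; i < n_ftypes; i++) {
            for (int j = 0; j < n_threads_arr; j++) {
                ++n_tests;
                if (run_one(ftypes[i], threads_arr[j])) {
                    ++n_passed;
                }
            }
        }

        printf("%d / %d passed\n", n_passed, n_tests);
        /* Non-zero exit if any combination failed, as the patched bench()
         * does, so CI still runs the full matrix before reporting. */
        return (n_passed == n_tests) ? 0 : 1;
    }

Running all combinations before returning, instead of bailing on the first failure, is the same design choice the patch makes: the summary line reports how many combinations passed, while the exit code stays strict for CI.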