From 1b041d773730efbfa98743ddc4446bbf14cafc8d Mon Sep 17 00:00:00 2001
From: mqy
Date: Wed, 14 Jun 2023 21:17:14 +0800
Subject: [PATCH] threading test: improve readability of both code and output

---
 tests/test-ggml-threading.c | 192 ++++++++++++++++++++----------------
 1 file changed, 108 insertions(+), 84 deletions(-)

diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c
index 0b47623e2..ed9d8aa2b 100644
--- a/tests/test-ggml-threading.c
+++ b/tests/test-ggml-threading.c
@@ -60,7 +60,11 @@ mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) {
 }
 
 int test_driver(int id, struct ggml_tensor *node, int n_threads) {
-    printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads);
+    uint8_t loops = node->task_profile.dev_flags[1];
+    printf(
+        "\n[test-ggml-threading] #%02d, workload: %2d million(s), n_threads: "
+        "%2d\n",
+        id, loops, n_threads);
 
     for (int i = 0; i < n_threads; i++) {
         work_done_arr[i] = 0;
@@ -86,9 +90,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
             ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
         if (err != GGML_COMPUTE_OK) {
             ggml_threading_stop(ctx);
-            fprintf(stderr,
-                    "ggml_threading_compute_tensor failed with error: %d.\n",
-                    err);
+            printf("ggml_threading_compute_tensor failed with error: %d.\n",
+                   err);
             return 1;
         }
     }
@@ -99,9 +102,11 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
 
     int t3 = (int)ggml_time_us();
 
+    const struct ggml_task_stage *stages = node->task_profile.stages;
+
     int expect = 0;
     for (int i = 0; i < 3; i++) {
-        struct ggml_task_stage *ts = &node->task_profile.stages[i];
+        const struct ggml_task_stage *ts = &stages[i];
         if (ts->backend != GGML_TASK_BACKEND_NONE) {
             if (ts->parallel) {
                 expect += n_threads;
@@ -117,16 +122,10 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
         actual += work_done_arr[i];
     }
 
-    uint8_t loops = node->task_profile.dev_flags[1];
-
-    printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: "
-           "(parallel: %d, "
-           "wait: %d)\n"
-           "\tstage-1: (parallel: %d, wait: %d)\n",
-           loops, wait_on_done, node->task_profile.stages[0].parallel,
-           node->task_profile.stages[0].wait,
-           node->task_profile.stages[1].parallel,
-           node->task_profile.stages[1].wait);
+    printf("\tstage-0: parallel: %d, wait: %d\n\tstage-1: parallel: %d, wait: "
+           "%d, wait_on_done: %d %s\n",
+           stages[0].parallel, stages[0].wait, stages[1].parallel,
+           stages[1].wait, wait_on_done, stages[1].wait ? "<--------" : "");
 
     if (actual == expect) {
         printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f "
@@ -136,8 +135,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
         return 0;
     }
 
-    fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect,
-            actual);
+    printf("\t== failed. expect %d done, actual %d done\n\n", expect, actual);
 
     return 2;
 }
@@ -172,8 +170,7 @@ int test_fallback(struct ggml_tensor *node) {
     ggml_threading_stop(ctx);
 
     if (err != GGML_COMPUTE_OK) {
-        fprintf(stderr,
-                "ggml_threading_compute_tensor failed with error: %d.\n", err);
+        printf("ggml_threading_compute_tensor failed with error: %d.\n", err);
         return 1;
     }
 
@@ -195,8 +192,6 @@ int main(void) {
     int n_passed = 0;
    int n_tests = 0;
 
-    int parallel[3] = {0, 1, 2};
-
     // In github build actions (windows-latest-cmake and ubuntu-latest-cmake):
     // When n_threads >= 4, the thread init time and compute time suddenly goes
     // down to 100x ~ 1000x slow -- comparing to n_threads == 2.
@@ -214,13 +209,47 @@ int main(void) {
     // average time, thus greatly punishes those small workloads.
     // - wait_on_done is general faster than wait_now, can be 10x faster.
 
-    int threads_arr[] = {1, 2, 4, 8};
+    int threads_arr[] = {1, 2, 4, 6, 8, 16};
     int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
 
     // millions of loops.
     uint8_t workload_arr[] = {0u, 1u, 10u};
     int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]);
 
+    // skip slow/big n_threads.
+    for (int i = 0; i < threads_arr_len; i++) {
+        int n_threads = threads_arr[i];
+
+        if (n_threads == 1) {
+            continue;
+        } else if (n_threads > MAX_N_THREADS) {
+            printf("[test-ggml-threading] warning: n_threads (%d) is too "
+                   "big, at most %d allowed, skip.\n",
+                   n_threads, MAX_N_THREADS);
+            threads_arr[i] = 0;
+            continue;
+        }
+
+        // skip this n_threads when too slow.
+        int t0 = (int)ggml_time_us();
+
+        struct ggml_threading_context *ctx =
+            ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
+                                 mock_task_runner, 0, /*stages_time*/ NULL);
+
+        int t1 = (int)ggml_time_us();
+
+        ggml_threading_stop(ctx);
+
+        int elapsed_us = t1 - t0;
+        if (elapsed_us > 500 * n_threads) {
+            printf("[test-ggml-threading] warning: it took %.3f "
+                   "ms to start %d worker thread(s). Too slow, skip.\n",
+                   1.0 * elapsed_us / 1000, n_threads - 1);
+            threads_arr[i] = 0;
+        }
+    }
+
     // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops.
 
     for (int x = 0; x < workload_arr_len; x++) {
@@ -228,37 +257,13 @@ int main(void) {
         for (int i = 0; i < threads_arr_len; i++) {
             int n_threads = threads_arr[i];
 
-            if (n_threads > MAX_N_THREADS) {
-                abort();
+            if (n_threads <= 0) {
+                continue;
             }
 
-            printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, "
-                   "loops: %2d million(s) ====\n",
-                   n_repeat, n_threads, workload_arr[x]);
-
-            if (n_threads > 1) { // skip this n_threads when too slow.
-                int t0 = (int)ggml_time_us();
-
-                struct ggml_threading_context *ctx = ggml_threading_start(
-                    n_threads, ggml_threading_graph_compute_thread,
-                    mock_task_runner, 0, /*stages_time*/ NULL);
-
-                int t1 = (int)ggml_time_us();
-
-                ggml_threading_stop(ctx);
-
-                int elapsed_us = t1 - t0;
-                if (elapsed_us > 500 * n_threads) {
-                    fprintf(stderr,
-                            "[test-ggml-threading] warning: it took took %.3f "
-                            "ms to start %d worker thread(s).\n",
-                            1.0 * elapsed_us / 1000, n_threads - 1);
-                    fprintf(stderr, "[test-ggml-threading] warning: looks like "
-                                    "the environment is too slow to run this "
-                                    "number of threads, skip.\n");
-                    continue;
-                }
-            }
+            printf("\n[test-ggml-threading] ==== workload: %2d million(s), "
+                   "n_threads: %2d ====\n",
+                   workload_arr[x], n_threads);
 
             // multi-threads: parallel + wait_now/wait_on_done
 
@@ -268,6 +273,8 @@ int main(void) {
             stages[0].wait = false;
             stages[1].wait = false;
 
+            node.task_profile.dev_flags[0] = 0u;
+
             n_tests++;
             if (test_driver(n_tests, &node, n_threads) == 0) {
                 n_passed++;
@@ -275,54 +282,71 @@ int main(void) {
                 continue;
             }
 
-            for (int j = 0; j < 3; j++) {
+            { // no parallel, no wait
+                stages[0].parallel = false;
+                stages[1].parallel = false;
                 stages[0].wait = false;
                 stages[1].wait = false;
+                node.task_profile.dev_flags[0] = 0u;
 
-                if (parallel[j] == 0) {
-                    stages[0].parallel = false;
-                    stages[1].parallel = false;
+                n_tests++;
+                if (test_driver(n_tests, &node, n_threads) == 0) {
+                    n_passed++;
+                }
+            }
+
+            { // both parallel, no wait
+                stages[0].parallel = true;
+                stages[1].parallel = true;
+                stages[0].wait = false;
+                stages[1].wait = false;
+
+                node.task_profile.dev_flags[0] = 0u;
+
+                n_tests++;
+                if (test_driver(n_tests, &node, n_threads) == 0) {
+                    n_passed++;
+                }
+            }
+
+            { // stage 0 parallel, stage 1 may wait
+                stages[0].parallel = true;
+                stages[1].parallel = false;
+                stages[0].wait = false;
+
+                { // stage 1 no wait
+                    stages[1].wait = false;
+                    node.task_profile.dev_flags[0] = 0u;
 
                     n_tests++;
                     if (test_driver(n_tests, &node, n_threads) == 0) {
                         n_passed++;
                     }
-                } else if (parallel[j] == 1) {
-                    stages[0].parallel = true;
-                    stages[1].parallel = false;
+                }
 
-                    for (int k = 0; k < 2; k++) {
-                        stages[1].wait = (k == 1);
+                { // stage 1 wait
+                    stages[1].wait = true;
+                    if (stages[1].parallel) {
+                        abort();
+                    }
 
-                        if (!stages[1].wait) {
-                            n_tests++;
-                            if (test_driver(n_tests, &node, n_threads) == 0) {
-                                n_passed++;
-                            }
-                            continue;
-                        }
+                    { // disable wait_on_done
+                        node.task_profile.dev_flags[0] = 0u; // wait now.
 
-                        // wait
-
-                        for (int m = 0; m < 2; m++) {
-                            if (m == 1) {
-                                node.task_profile.dev_flags[0] = 1u;
-                            }
-                            n_tests++;
-                            if (test_driver(n_tests, &node, n_threads) == 0) {
-                                n_passed++;
-                            }
-                            node.task_profile.dev_flags[0] = 0u;
+                        n_tests++;
+                        if (test_driver(n_tests, &node, n_threads) == 0) {
+                            n_passed++;
                         }
                     }
 
-                } else {
-                    stages[0].parallel = true;
-                    stages[1].parallel = true;
-
-                    n_tests++;
-                    if (test_driver(n_tests, &node, n_threads) == 0) {
-                        n_passed++;
+                    { // enable wait_on_done
+                        node.task_profile.dev_flags[0] = 1u; // wait on done
+
+                        n_tests++;
+                        if (test_driver(n_tests, &node, n_threads) == 0) {
+                            n_passed++;
+                        }
                    }
                }
            }
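
A note on the pass/fail criterion the reworked output above reports on: test_driver counts expected work per stage. Each of the three task stages whose backend is not GGML_TASK_BACKEND_NONE contributes n_threads to the expected total when it is parallel, else 1, and the per-thread work_done_arr counters must sum to exactly that. A worked example of the same accounting, where the stage setup and the n_threads value are assumed purely for illustration:

    /* Sketch of test_driver's expect accounting (mirrors the patch).
     * Assume stage 0 parallel, stage 1 serial, stage 2 backend NONE,
     * and n_threads == 4: expect == 4 + 1 == 5, so the test fails
     * unless the work_done_arr counters sum to exactly 5. */
    int expect = 0;
    for (int i = 0; i < 3; i++) {
        const struct ggml_task_stage *ts = &stages[i];
        if (ts->backend != GGML_TASK_BACKEND_NONE) {
            expect += ts->parallel ? n_threads : 1;
        }
    }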
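
The skip loop hoisted in front of the main test loop measures only worker startup cost: it starts a threading context, timestamps before tearing it down, and zeroes the threads_arr slot when startup alone exceeds 500 us per thread. The same gate as a standalone helper is sketched below; the helper name and THRESHOLD_US_PER_THREAD are illustrative, while the ggml calls are exactly those used in the patch:

    #include <stdbool.h>

    #define THRESHOLD_US_PER_THREAD 500 // mirrors the hard-coded bound above

    // Returns true when starting n_threads workers is cheap enough to test.
    static bool startup_is_fast_enough(int n_threads) {
        int t0 = (int)ggml_time_us();
        struct ggml_threading_context *ctx = ggml_threading_start(
            n_threads, ggml_threading_graph_compute_thread, mock_task_runner,
            0, /*stages_time*/ NULL);
        int t1 = (int)ggml_time_us(); // time the start only, not the stop
        ggml_threading_stop(ctx);
        return (t1 - t0) <= THRESHOLD_US_PER_THREAD * n_threads;
    }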
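
The unrolled blocks that replace the old parallel[j] switch enumerate exactly five stage configurations per (workload, n_threads) pair. Written as a table they are easier to audit; the struct and array names below are illustrative only, while the fields mirror stages[0/1].parallel, stages[1].wait, and dev_flags[0] (wait_on_done) from the patch:

    struct stage_config {
        bool    parallel0, parallel1, wait1;
        uint8_t wait_on_done; // node.task_profile.dev_flags[0]
    };

    static const struct stage_config configs[] = {
        {false, false, false, 0u}, // no parallel, no wait
        {true,  true,  false, 0u}, // both parallel, no wait
        {true,  false, false, 0u}, // stage 0 parallel, stage 1 no wait
        {true,  false, true,  0u}, // stage 1 wait, wait now
        {true,  false, true,  1u}, // stage 1 wait, wait on done
    };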