mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 03:01:45 +00:00
threading test: improve readability at both codes and output
This commit is contained in:
parent
213f133701
commit
1b041d7737
@ -60,7 +60,11 @@ mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int test_driver(int id, struct ggml_tensor *node, int n_threads) {
|
int test_driver(int id, struct ggml_tensor *node, int n_threads) {
|
||||||
printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads);
|
uint8_t loops = node->task_profile.dev_flags[1];
|
||||||
|
printf(
|
||||||
|
"\n[test-ggml-threading] #%02d, workload: %2d million(s), n_threads: "
|
||||||
|
"%2d\n",
|
||||||
|
id, loops, n_threads);
|
||||||
|
|
||||||
for (int i = 0; i < n_threads; i++) {
|
for (int i = 0; i < n_threads; i++) {
|
||||||
work_done_arr[i] = 0;
|
work_done_arr[i] = 0;
|
||||||
@ -86,8 +90,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
|
|||||||
ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
|
ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
|
||||||
if (err != GGML_COMPUTE_OK) {
|
if (err != GGML_COMPUTE_OK) {
|
||||||
ggml_threading_stop(ctx);
|
ggml_threading_stop(ctx);
|
||||||
fprintf(stderr,
|
printf("ggml_threading_compute_tensor failed with error: %d.\n",
|
||||||
"ggml_threading_compute_tensor failed with error: %d.\n",
|
|
||||||
err);
|
err);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -99,9 +102,11 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
|
|||||||
|
|
||||||
int t3 = (int)ggml_time_us();
|
int t3 = (int)ggml_time_us();
|
||||||
|
|
||||||
|
const struct ggml_task_stage *stages = node->task_profile.stages;
|
||||||
|
|
||||||
int expect = 0;
|
int expect = 0;
|
||||||
for (int i = 0; i < 3; i++) {
|
for (int i = 0; i < 3; i++) {
|
||||||
struct ggml_task_stage *ts = &node->task_profile.stages[i];
|
const struct ggml_task_stage *ts = &stages[i];
|
||||||
if (ts->backend != GGML_TASK_BACKEND_NONE) {
|
if (ts->backend != GGML_TASK_BACKEND_NONE) {
|
||||||
if (ts->parallel) {
|
if (ts->parallel) {
|
||||||
expect += n_threads;
|
expect += n_threads;
|
||||||
@ -117,16 +122,10 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
|
|||||||
actual += work_done_arr[i];
|
actual += work_done_arr[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t loops = node->task_profile.dev_flags[1];
|
printf("\tstage-0: parallel: %d, wait: %d\n\tstage-1: parallel: %d, wait: "
|
||||||
|
"%d, wait_on_done: %d %s\n",
|
||||||
printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: "
|
stages[0].parallel, stages[0].wait, stages[1].parallel,
|
||||||
"(parallel: %d, "
|
stages[1].wait, wait_on_done, stages[1].wait ? "<--------" : "");
|
||||||
"wait: %d)\n"
|
|
||||||
"\tstage-1: (parallel: %d, wait: %d)\n",
|
|
||||||
loops, wait_on_done, node->task_profile.stages[0].parallel,
|
|
||||||
node->task_profile.stages[0].wait,
|
|
||||||
node->task_profile.stages[1].parallel,
|
|
||||||
node->task_profile.stages[1].wait);
|
|
||||||
|
|
||||||
if (actual == expect) {
|
if (actual == expect) {
|
||||||
printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f "
|
printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f "
|
||||||
@ -136,8 +135,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect,
|
printf("\t== failed. expect %d done, actual %d done\n\n", expect, actual);
|
||||||
actual);
|
|
||||||
|
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
@ -172,8 +170,7 @@ int test_fallback(struct ggml_tensor *node) {
|
|||||||
|
|
||||||
ggml_threading_stop(ctx);
|
ggml_threading_stop(ctx);
|
||||||
if (err != GGML_COMPUTE_OK) {
|
if (err != GGML_COMPUTE_OK) {
|
||||||
fprintf(stderr,
|
printf("ggml_threading_compute_tensor failed with error: %d.\n", err);
|
||||||
"ggml_threading_compute_tensor failed with error: %d.\n", err);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -195,8 +192,6 @@ int main(void) {
|
|||||||
int n_passed = 0;
|
int n_passed = 0;
|
||||||
int n_tests = 0;
|
int n_tests = 0;
|
||||||
|
|
||||||
int parallel[3] = {0, 1, 2};
|
|
||||||
|
|
||||||
// In github build actions (windows-latest-cmake and ubuntu-latest-cmake):
|
// In github build actions (windows-latest-cmake and ubuntu-latest-cmake):
|
||||||
// When n_threads >= 4, the thread init time and compute time suddenly
|
// When n_threads >= 4, the thread init time and compute time suddenly
|
||||||
// become 100x ~ 1000x slower compared to n_threads == 2.
|
// become 100x ~ 1000x slower compared to n_threads == 2.
|
||||||
@ -214,33 +209,32 @@ int main(void) {
|
|||||||
// average time, thus greatly punishes those small workloads.
|
// average time, thus greatly punishes those small workloads.
|
||||||
// - wait_on_done is generally faster than wait_now, can be 10x faster.
|
// - wait_on_done is generally faster than wait_now, can be 10x faster.
|
||||||
|
|
||||||
int threads_arr[] = {1, 2, 4, 8};
|
int threads_arr[] = {1, 2, 4, 6, 8, 16};
|
||||||
int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
|
int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
|
||||||
|
|
||||||
// millions of loops.
|
// millions of loops.
|
||||||
uint8_t workload_arr[] = {0u, 1u, 10u};
|
uint8_t workload_arr[] = {0u, 1u, 10u};
|
||||||
int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]);
|
int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]);
|
||||||
|
|
||||||
// node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops.
|
// skip slow/big n_threads.
|
||||||
|
|
||||||
for (int x = 0; x < workload_arr_len; x++) {
|
|
||||||
node.task_profile.dev_flags[1] = workload_arr[x];
|
|
||||||
|
|
||||||
for (int i = 0; i < threads_arr_len; i++) {
|
for (int i = 0; i < threads_arr_len; i++) {
|
||||||
int n_threads = threads_arr[i];
|
int n_threads = threads_arr[i];
|
||||||
if (n_threads > MAX_N_THREADS) {
|
|
||||||
abort();
|
if (n_threads == 1) {
|
||||||
|
continue;
|
||||||
|
} else if (n_threads > MAX_N_THREADS) {
|
||||||
|
printf("[test-ggml-threading] warning: the n_threads (%d) is too "
|
||||||
|
"big, allow at most %d, skip.\n",
|
||||||
|
n_threads, MAX_N_THREADS);
|
||||||
|
threads_arr[i] = 0;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, "
|
// skip this n_threads when too slow.
|
||||||
"loops: %2d million(s) ====\n",
|
|
||||||
n_repeat, n_threads, workload_arr[x]);
|
|
||||||
|
|
||||||
if (n_threads > 1) { // skip this n_threads when too slow.
|
|
||||||
int t0 = (int)ggml_time_us();
|
int t0 = (int)ggml_time_us();
|
||||||
|
|
||||||
struct ggml_threading_context *ctx = ggml_threading_start(
|
struct ggml_threading_context *ctx =
|
||||||
n_threads, ggml_threading_graph_compute_thread,
|
ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
|
||||||
mock_task_runner, 0, /*stages_time*/ NULL);
|
mock_task_runner, 0, /*stages_time*/ NULL);
|
||||||
|
|
||||||
int t1 = (int)ggml_time_us();
|
int t1 = (int)ggml_time_us();
|
||||||
@ -249,16 +243,27 @@ int main(void) {
|
|||||||
|
|
||||||
int elapsed_us = t1 - t0;
|
int elapsed_us = t1 - t0;
|
||||||
if (elapsed_us > 500 * n_threads) {
|
if (elapsed_us > 500 * n_threads) {
|
||||||
fprintf(stderr,
|
printf("[test-ggml-threading] warning: it took took %.3f "
|
||||||
"[test-ggml-threading] warning: it took took %.3f "
|
"ms to start %d worker thread(s). Loo slow, skip.\n",
|
||||||
"ms to start %d worker thread(s).\n",
|
|
||||||
1.0 * elapsed_us / 1000, n_threads - 1);
|
1.0 * elapsed_us / 1000, n_threads - 1);
|
||||||
fprintf(stderr, "[test-ggml-threading] warning: looks like "
|
threads_arr[i] = 0;
|
||||||
"the environment is too slow to run this "
|
}
|
||||||
"number of threads, skip.\n");
|
}
|
||||||
|
|
||||||
|
// node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops.
|
||||||
|
|
||||||
|
for (int x = 0; x < workload_arr_len; x++) {
|
||||||
|
node.task_profile.dev_flags[1] = workload_arr[x];
|
||||||
|
|
||||||
|
for (int i = 0; i < threads_arr_len; i++) {
|
||||||
|
int n_threads = threads_arr[i];
|
||||||
|
if (n_threads <= 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
printf("\n[test-ggml-threading] ==== workload: %2d million(s), "
|
||||||
|
"n_threads: %2d ====\n",
|
||||||
|
workload_arr[x], n_threads);
|
||||||
|
|
||||||
// multi-threads: parallel + wait_now/wait_on_done
|
// multi-threads: parallel + wait_now/wait_on_done
|
||||||
|
|
||||||
@ -268,6 +273,8 @@ int main(void) {
|
|||||||
stages[0].wait = false;
|
stages[0].wait = false;
|
||||||
stages[1].wait = false;
|
stages[1].wait = false;
|
||||||
|
|
||||||
|
node.task_profile.dev_flags[0] = 0u;
|
||||||
|
|
||||||
n_tests++;
|
n_tests++;
|
||||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||||
n_passed++;
|
n_passed++;
|
||||||
@ -275,50 +282,66 @@ int main(void) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int j = 0; j < 3; j++) {
|
{ // no parallel, no wait
|
||||||
|
stages[0].parallel = false;
|
||||||
|
stages[1].parallel = false;
|
||||||
stages[0].wait = false;
|
stages[0].wait = false;
|
||||||
stages[1].wait = false;
|
stages[1].wait = false;
|
||||||
|
|
||||||
|
node.task_profile.dev_flags[0] = 0u;
|
||||||
|
|
||||||
|
n_tests++;
|
||||||
|
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||||
|
n_passed++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // both parallel, no wait
|
||||||
|
stages[0].parallel = true;
|
||||||
|
stages[1].parallel = true;
|
||||||
|
stages[0].wait = false;
|
||||||
|
stages[1].wait = false;
|
||||||
|
|
||||||
|
node.task_profile.dev_flags[0] = 0u;
|
||||||
|
|
||||||
|
n_tests++;
|
||||||
|
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||||
|
n_passed++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // stage 0 parallel, stage 1 may wait
|
||||||
|
stages[0].parallel = true;
|
||||||
|
stages[1].parallel = false;
|
||||||
|
stages[0].wait = false;
|
||||||
|
|
||||||
|
{ // stage 1 no wait
|
||||||
|
stages[1].wait = false;
|
||||||
node.task_profile.dev_flags[0] = 0u;
|
node.task_profile.dev_flags[0] = 0u;
|
||||||
|
|
||||||
if (parallel[j] == 0) {
|
n_tests++;
|
||||||
stages[0].parallel = false;
|
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||||
stages[1].parallel = false;
|
n_passed++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // stage 1 wait
|
||||||
|
stages[1].wait = true;
|
||||||
|
if (stages[1].parallel) {
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // disable wait_on_done
|
||||||
|
node.task_profile.dev_flags[0] = 0u; // wait now.
|
||||||
|
|
||||||
n_tests++;
|
n_tests++;
|
||||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||||
n_passed++;
|
n_passed++;
|
||||||
}
|
}
|
||||||
} else if (parallel[j] == 1) {
|
|
||||||
stages[0].parallel = true;
|
|
||||||
stages[1].parallel = false;
|
|
||||||
|
|
||||||
for (int k = 0; k < 2; k++) {
|
|
||||||
stages[1].wait = (k == 1);
|
|
||||||
|
|
||||||
if (!stages[1].wait) {
|
|
||||||
n_tests++;
|
|
||||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
|
||||||
n_passed++;
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// wait
|
{ // enable wait_on_done
|
||||||
|
node.task_profile.dev_flags[0] = 1u; // wait on done
|
||||||
for (int m = 0; m < 2; m++) {
|
|
||||||
if (m == 1) {
|
|
||||||
node.task_profile.dev_flags[0] = 1u;
|
|
||||||
}
|
|
||||||
n_tests++;
|
|
||||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
|
||||||
n_passed++;
|
|
||||||
}
|
|
||||||
node.task_profile.dev_flags[0] = 0u;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
stages[0].parallel = true;
|
|
||||||
stages[1].parallel = true;
|
|
||||||
|
|
||||||
n_tests++;
|
n_tests++;
|
||||||
if (test_driver(n_tests, &node, n_threads) == 0) {
|
if (test_driver(n_tests, &node, n_threads) == 0) {
|
||||||
@ -328,6 +351,7 @@ int main(void) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
++n_tests;
|
++n_tests;
|
||||||
|
Loading…
Reference in New Issue
Block a user