// llama.cpp/tests/test-barrier.cpp

#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"

#include <cassert>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// NOTE(review): MAX_NARGS is not referenced anywhere in the visible source —
// looks like a leftover; confirm there is no other user before removing it.
#define MAX_NARGS 2

int main(int argc, char *argv[]) {

    int n_threads = 4;
    int n_rounds  = 100;

    if (argc > 1) {
        n_threads = std::atoi(argv[1]);
    }

    if (argc > 2) {
        n_rounds  = std::atoi(argv[2]);
    }

    struct ggml_init_params params = {
        /* .mem_size   = */ 1024*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    // Create graph
    struct ggml_cgraph * gf = ggml_new_graph(ctx);

    // Lots of small, parallel ops where barriers in between will dominate
    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
    for (int i = 0; i < 1000; i++) {
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
        out = ggml_mul_mat(ctx, a, out);

        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
        out = ggml_mul_mat(ctx, d, out);
    }

    ggml_build_forward_expand(gf, out);
    int n_nodes = ggml_graph_n_nodes(gf);

    // Create threadpool
    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
    if (!threadpool) {
        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
        exit(1);
    }

    // Create compute plan
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);

    std::vector<uint8_t> work_data(cplan.work_size);
    cplan.work_data = work_data.data();

    std::cerr << "graph-compute with"
              << "\n n_threads: " << n_threads
              << "\n   n_nodes: " << n_nodes
              << "\n  n_rounds: " << n_rounds
              << "\n";
    // ggml_graph_print(gf);

    // Warmup
    ggml_graph_compute(gf, &cplan);

    auto t0 = std::chrono::high_resolution_clock::now();

    for (int i=0; i < n_rounds; i++) {
        ggml_graph_compute(gf, &cplan);
    }

    auto t1 = std::chrono::high_resolution_clock::now();

    auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
    auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
    std::cerr << "graph-compute took " << usec << " usec "
              << "\n " << (float) usec / n_rounds << " usec per-iter"
              << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
              << "\n";

    ggml_threadpool_free(threadpool);
    ggml_free(ctx);

    return 0;
}
threadpool : skip polling for unused threads (#9461) * threadpool: skip polling for unused threads Currently all threads do N polling rounds even if only 1 thread is active (n_threads_cur == 1). This commit adds a check to skip the polling for unused threads (ith >= n_threads_cur). n_threads_cur is now an atomic_int to explicitly tell thread sanitizer that it is written from one thread and read from other threads (not a race condition). * threadpool: further simplify and improve ggml_barrier Avoid using strict memory order while polling, yet make sure that all threads go through full memory barrier (memory fence) on ggml_barrier entrance and exit. * threads: add simple barrier test This test does lots of small, parallel matmul ops where the barriers in between dominate the overhead. * threadpool: improve thread sync for new-graphs Using the same tricks as ggml_barrier. All the polling is done with relaxed memory order to keep it efficient, once the new graph is detected we do full fence using read-modify-write with strict memory order. * threadpool: improve abort handling Do not use threadpool->ec (exit code) to decide whether to exit the compute loop. threadpool->ec is not atomic which makes thread-sanitizer rightfully unhappy about it. Instead introduce atomic threadpool->abort flag used for this. This is consistent with how we handle threadpool->stop or pause. While at it add an explicit atomic_load for n_threads_cur for consistency. * test-barrier: release threadpool before releasing the context fixes use-after-free detected by gcc thread-sanitizer on x86-64; for some reason llvm sanitizer is not detecting this issue. 2024-09-17 08:19:46 +00:00			`#include "ggml.h"`
ggml : move CPU backend to a separate file (#10144) 2024-11-03 18:34:08 +00:00			`#include "ggml-cpu.h"`
threadpool : skip polling for unused threads (#9461) * threadpool: skip polling for unused threads Currently all threads do N polling rounds even if only 1 thread is active (n_threads_cur == 1). This commit adds a check to skip the polling for unused threads (ith >= n_threads_cur). n_threads_cur is now an atomic_int to explicitly tell thread sanitizer that it is written from one thread and read from other threads (not a race condition). * threadpool: further simplify and improve ggml_barrier Avoid using strict memory order while polling, yet make sure that all threads go through full memory barrier (memory fence) on ggml_barrier entrance and exit. * threads: add simple barrier test This test does lots of small, parallel matmul ops where the barriers in between dominate the overhead. * threadpool: improve thread sync for new-graphs Using the same tricks as ggml_barrier. All the polling is done with relaxed memory order to keep it efficient, once the new graph is detected we do full fence using read-modify-write with strict memory order. * threadpool: improve abort handling Do not use threadpool->ec (exit code) to decide whether to exit the compute loop. threadpool->ec is not atomic which makes thread-sanitizer rightfully unhappy about it. Instead introduce atomic threadpool->abort flag used for this. This is consistent with how we handle threadpool->stop or pause. While at it add an explicit atomic_load for n_threads_cur for consistency. * test-barrier: release threadpool before releasing the context fixes use-after-free detected by gcc thread-sanitizer on x86-64; for some reason llvm sanitizer is not detecting this issue. 2024-09-17 08:19:46 +00:00			`#include "ggml-backend.h"`

			`#include <chrono>`
			`#include <iostream>`
			`#include <cstdio>`
			`#include <cstdlib>`
			`#include <cassert>`
			`#include <vector>`

			`#define MAX_NARGS 2`

			`int main(int argc, char *argv[]) {`

			`int n_threads = 4;`
			`int n_rounds = 100;`

			`if (argc > 1) {`
			`n_threads = std::atoi(argv[1]);`
			`}`

			`if (argc > 2) {`
			`n_rounds = std::atoi(argv[2]);`
			`}`

			`struct ggml_init_params params = {`
			`/* .mem_size = */ 1024*1024*1024,`
			`/* .mem_buffer = */ NULL,`
			`/* .no_alloc = */ false,`
			`};`

			`struct ggml_context * ctx = ggml_init(params);`

			`// Create graph`
			`struct ggml_cgraph * gf = ggml_new_graph(ctx);`

			`// Lots of small, parallel ops where barriers in between will dominate`
			`struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);`
			`for (int i = 0; i < 1000; i++) {`
			`struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);`
			`out = ggml_mul_mat(ctx, a, out);`

			`struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);`
			`out = ggml_mul_mat(ctx, d, out);`
			`}`

			`ggml_build_forward_expand(gf, out);`
			`int n_nodes = ggml_graph_n_nodes(gf);`

			`// Create threadpool`
			`struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);`
			`struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);`
			`if (!threadpool) {`
			`fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);`
			`exit(1);`
			`}`

			`// Create compute plan`
			`struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);`

			`std::vector<uint8_t> work_data(cplan.work_size);`
			`cplan.work_data = work_data.data();`

			`std::cerr << "graph-compute with"`
			`<< "\n n_threads: " << n_threads`
			`<< "\n n_nodes: " << n_nodes`
			`<< "\n n_rounds: " << n_rounds`
			`<< "\n";`
			`// ggml_graph_print(gf);`

			`// Warmup`
			`ggml_graph_compute(gf, &cplan);`

			`auto t0 = std::chrono::high_resolution_clock::now();`

			`for (int i=0; i < n_rounds; i++) {`
			`ggml_graph_compute(gf, &cplan);`
			`}`

			`auto t1 = std::chrono::high_resolution_clock::now();`

			`auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();`
			`auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();`
			`std::cerr << "graph-compute took " << usec << " usec "`
			`<< "\n " << (float) usec / n_rounds << " usec per-iter"`
			`<< "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"`
			`<< "\n";`

			`ggml_threadpool_free(threadpool);`
			`ggml_free(ctx);`

			`return 0;`
			`}`