mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 02:44:36 +00:00
threads: improve ggml_barrier scaling with large number of threads (#9598)
Make sure n_barrier and n_barrier_passed do not share the cache line to avoid cache line bouncing. This optimization shows performance improvements even for n_threads <= 8 cases. Resurrect TSAN (Thread Sanitizer) check so that we can avoid doing expensive read-modify-write in the normal case and just use thread-fence as originally intended. --- Here is the original description and suggestions from Willy Tarreau: There's currently some false sharing between n_barrier and n_barrier_passed that is amplified in ggml_barrier() by the fact that all threads need to increment n_barrier when entering, while all previous threads continue to read n_barrier_passed, waiting for the last one to release them all. The side effect is that all these readers are slowing down all new threads by making the cache line bounce back and forth between readers and writers. Just placing them in two distinct cache lines is sufficient to boost the performance by 21% on a 80-core ARM server compared to the no-openmp version, and by 3% compared to the openmp version. Note that the variables could have been spread apart in the structure as well, but it doesn't seem that the size of this threadpool struct is critical so here we're simply aligning them. Finally, the same issue was present when leaving the barrier since all threads had to update the n_barrier_passed counter, though only one would add a non-zero value. This alone is responsible for half of the cost due to undesired serialization. It might be possible that using a small array of n_barrier counters could make things even faster on many-core systems, but it would likely complicate the logic needed to detect the last thread. Co-authored-by: Willy Tarreau <w@1wt.eu>
This commit is contained in:
parent
1d48e98e4f
commit
f0c7b5edf8
@ -63,6 +63,25 @@ int ggml_sve_cnt_b = 0;
|
|||||||
#pragma warning(disable: 4702)
|
#pragma warning(disable: 4702)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Note: once we move threading into a separate C++ file
|
||||||
|
// will use std::hardware_destructive_interference_size instead of hardcoding it here
|
||||||
|
// and we'll use C++ attribute syntax.
|
||||||
|
#define GGML_CACHE_LINE 64
|
||||||
|
|
||||||
|
#if defined(__clang__) || defined(__GNUC__)
|
||||||
|
#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__has_feature)
|
||||||
|
#if __has_feature(thread_sanitizer)
|
||||||
|
#define GGML_TSAN_ENABLED 1
|
||||||
|
#endif
|
||||||
|
#else // __has_feature
|
||||||
|
#if defined(__SANITIZE_THREAD__)
|
||||||
|
#define GGML_TSAN_ENABLED 1
|
||||||
|
#endif
|
||||||
|
#endif // __has_feature
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
#define WIN32_LEAN_AND_MEAN
|
||||||
@ -72,6 +91,8 @@ int ggml_sve_cnt_b = 0;
|
|||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
|
||||||
#if !defined(__clang__)
|
#if !defined(__clang__)
|
||||||
|
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
|
||||||
|
|
||||||
typedef volatile LONG atomic_int;
|
typedef volatile LONG atomic_int;
|
||||||
typedef atomic_int atomic_bool;
|
typedef atomic_int atomic_bool;
|
||||||
typedef atomic_int atomic_flag;
|
typedef atomic_int atomic_flag;
|
||||||
@ -2007,8 +2028,8 @@ struct ggml_threadpool {
|
|||||||
|
|
||||||
// synchronization primitives
|
// synchronization primitives
|
||||||
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
|
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
|
||||||
atomic_int n_barrier;
|
atomic_int GGML_CACHE_ALIGN n_barrier;
|
||||||
atomic_int n_barrier_passed;
|
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
|
||||||
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
||||||
|
|
||||||
// these are atomic as an annotation for thread-sanitizer
|
// these are atomic as an annotation for thread-sanitizer
|
||||||
@ -3196,20 +3217,27 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
|
|||||||
// enter barrier (full seq-cst fence)
|
// enter barrier (full seq-cst fence)
|
||||||
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
|
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
|
||||||
|
|
||||||
int last = 0;
|
|
||||||
if (n_barrier == (n_threads - 1)) {
|
if (n_barrier == (n_threads - 1)) {
|
||||||
// last thread
|
// last thread
|
||||||
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
|
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
|
||||||
last = 1;
|
|
||||||
} else {
|
// exit barrier (fill seq-cst fence)
|
||||||
// wait for other threads
|
atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
|
||||||
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
|
return;
|
||||||
ggml_thread_cpu_relax();
|
}
|
||||||
}
|
|
||||||
|
// wait for other threads
|
||||||
|
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
|
||||||
|
ggml_thread_cpu_relax();
|
||||||
}
|
}
|
||||||
|
|
||||||
// exit barrier (full seq-cst fence)
|
// exit barrier (full seq-cst fence)
|
||||||
atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
|
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
|
||||||
|
#ifdef GGML_TSAN_ENABLED
|
||||||
|
atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
|
||||||
|
#else
|
||||||
|
atomic_thread_fence(memory_order_seq_cst);
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -20240,10 +20268,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
|
|||||||
|
|
||||||
// sync thread state after polling
|
// sync thread state after polling
|
||||||
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
|
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
|
||||||
struct ggml_threadpool * threadpool = state->threadpool;
|
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
|
||||||
// this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
|
#ifdef GGML_TSAN_ENABLED
|
||||||
// so instead we just use a dummy read-modify-write
|
atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
|
||||||
atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
|
#else
|
||||||
|
atomic_thread_fence(memory_order_seq_cst);
|
||||||
|
#endif
|
||||||
|
UNUSED(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
|
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
|
||||||
|
Loading…
Reference in New Issue
Block a user