From 21e88c8b0ff2979f416c5b2e240c65acadcf5fb2 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sat, 1 Apr 2023 20:16:36 +0200 Subject: [PATCH 1/3] run sanitizers in release, otherwise too slow (#5) --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 88e70e495..482fc6457 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -326,7 +326,7 @@ jobs: # sudo apt-get install cmake # # - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON +# run: cmake . -DCMAKE_BUILD_TYPE=RelWithDebInfo -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON # # - name: Build # run: | From a65d37ad36f8c5be36ccb8f0d1ddddfd67cab777 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Fri, 31 Mar 2023 21:03:48 +0200 Subject: [PATCH 2/3] using github Pithikos/C-Thread-Pool for threading --- CMakeLists.txt | 5 +- Makefile | 17 +- ggml.c | 234 +++++++++------------ thpool.c | 553 +++++++++++++++++++++++++++++++++++++++++++++++++ thpool.h | 187 +++++++++++++++++ 5 files changed, 855 insertions(+), 141 deletions(-) create mode 100644 thpool.c create mode 100644 thpool.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 37f22700b..e3316f5b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,7 +227,10 @@ endif() add_library(ggml OBJECT ggml.c - ggml.h) + ggml.h + thpool.c + thpool.h + ) target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump diff --git a/Makefile b/Makefile index 83a4514ef..dd0c9e45a 100644 --- a/Makefile +++ b/Makefile @@ -225,6 +225,9 @@ default: main quantize perplexity embedding # Build library # +thpool.o: thpool.c thpool.h + $(CC) $(CFLAGS) -c thpool.c -o thpool.o + ggml.o: ggml.c ggml.h $(CC) $(CFLAGS) -c ggml.c -o ggml.o @@ -237,20 +240,20 @@ common.o: examples/common.cpp examples/common.h clean: rm -vf *.o main quantize perplexity embedding -main: examples/main/main.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) +main: examples/main/main.cpp thpool.o ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/main/main.cpp thpool.o ggml.o llama.o common.o -o main $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: examples/quantize/quantize.cpp ggml.o llama.o - $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) +quantize: examples/quantize/quantize.cpp thpool.o ggml.o llama.o + $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp thpool.o ggml.o llama.o -o quantize $(LDFLAGS) perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) + $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp thpool.o ggml.o llama.o common.o -o perplexity $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS) +embedding: examples/embedding/embedding.cpp thpool.o ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp thpool.o ggml.o llama.o common.o -o embedding $(LDFLAGS) # # Tests diff --git a/ggml.c b/ggml.c index 25fa72632..58e51159c 100644 --- a/ggml.c +++ b/ggml.c @@ -3,6 +3,8 @@ #include "ggml.h" +#include "thpool.h" + #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) @@ -72,6 +74,59 @@ static int sched_yield (void) { Sleep (0); return 0; } + +typedef struct pthread_mutex_tag { + CRITICAL_SECTION critical_section; +} pthread_mutex_t; + +typedef struct pthread_mutexattr_tag { + int attr; +} pthread_mutexattr_t; + +int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) { + InitializeCriticalSection (&mutex->critical_section); + return 0; +} + +int pthread_mutex_destroy(pthread_mutex_t * mutex) { + DeleteCriticalSection(&mutex->critical_section); + return 0; +} + + +int pthread_mutex_lock(pthread_mutex_t * mutex) { + EnterCriticalSection(&mutex->critical_section); + return 0; +} + +int pthread_mutex_unlock(pthread_mutex_t * mutex) { + LeaveCriticalSection(&mutex->critical_section); + return 0; +} + +typedef struct pthread_cond_tag { + CONDITION_VARIABLE cond; +} pthread_cond_t; + +int pthread_cond_init(pthread_cond_t * cond, void * unused) { + InitializeConditionVariable (&cond->cond); + return 0; +} + +int pthread_cond_destroy(pthread_cond_t * cond) { + return 0; +} + +int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) { + SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE); + return 0; +} + +int pthread_cond_broadcast(pthread_cond_t * cond) { + WakeAllConditionVariable(&cond->cond); + return 0; +} + #else #include #include @@ -2538,6 +2593,7 @@ struct ggml_context { struct ggml_scratch scratch; struct ggml_scratch scratch_save; + threadpool tpool; }; struct ggml_context_container { @@ -2822,6 +2878,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.objects_end =*/ NULL, /*.scratch =*/ { 0, 0, NULL, }, /*.scratch_save =*/ { 0, 0, NULL, }, + /*.thpool =*/ NULL, }; GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure @@ -8954,6 +9011,19 @@ typedef pthread_t ggml_thread_t; #define ggml_thread_create pthread_create #define ggml_thread_join pthread_join +typedef pthread_mutex_t ggml_mutex_t; +typedef pthread_cond_t ggml_cond_t; + +#define ggml_mutex_init pthread_mutex_init +#define ggml_mutex_destroy pthread_mutex_destroy +#define ggml_cond_init pthread_cond_init +#define ggml_cond_destroy pthread_cond_destroy + +#define ggml_mutex_lock pthread_mutex_lock +#define ggml_mutex_unlock pthread_mutex_unlock +#define ggml_cond_broadcast pthread_cond_broadcast +#define ggml_cond_wait pthread_cond_wait + #else //typedef pthread_spinlock_t ggml_lock_t; @@ -8977,17 +9047,31 @@ typedef pthread_t ggml_thread_t; #define ggml_thread_create pthread_create #define ggml_thread_join pthread_join +typedef pthread_mutex_t ggml_mutex_t; +typedef pthread_cond_t ggml_cond_t; + +#define ggml_mutex_init pthread_mutex_init +#define ggml_mutex_destroy pthread_mutex_destroy +#define ggml_cond_init pthread_cond_init +#define ggml_cond_destroy pthread_cond_destroy + +#define ggml_mutex_lock pthread_mutex_lock +#define ggml_mutex_unlock pthread_mutex_unlock +#define ggml_cond_broadcast pthread_cond_broadcast +#define ggml_cond_wait pthread_cond_wait + #endif struct ggml_compute_state_shared { - ggml_lock_t spin; int n_threads; // synchronization primitives - atomic_int n_ready; - atomic_bool has_work; - atomic_bool stop; // stop all threads + int n_ready; + bool has_work; + bool stop; // stop all threads + ggml_mutex_t mutex; + ggml_cond_t cond; }; struct ggml_compute_state { @@ -8999,72 +9083,31 @@ struct ggml_compute_state { struct ggml_compute_state_shared * shared; }; -static thread_ret_t ggml_graph_compute_thread(void * data) { +static void ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - - const int n_threads = state->shared->n_threads; - - while (true) { - if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { - atomic_store(&state->shared->has_work, false); - } else { - while (atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); - } - } - - atomic_fetch_sub(&state->shared->n_ready, 1); - - // wait for work - while (!atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); - } - - // check if we should stop - if (atomic_load(&state->shared->stop)) { - break; - } - - if (state->node) { - if (state->params.ith < state->params.nth) { - ggml_compute_forward(&state->params, state->node); - } - - state->node = NULL; - } else { - break; + if (state->node) { + if (state->params.ith < state->params.nth) { + ggml_compute_forward(&state->params, state->node); } + state->node = NULL; } - - return 0; } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { const int n_threads = cgraph->n_threads; - struct ggml_compute_state_shared state_shared = { - /*.spin =*/ GGML_LOCK_INITIALIZER, /*.n_threads =*/ n_threads, /*.n_ready =*/ 0, /*.has_work =*/ false, /*.stop =*/ false, + /*.mutex =*/ {0}, + /*.cond =*/ {0}, }; struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; // create thread pool if (n_threads > 1) { - ggml_lock_init(&state_shared.spin); - - atomic_store(&state_shared.has_work, true); - + ctx->tpool = thpool_init(n_threads); for (int j = 0; j < n_threads - 1; j++) { workers[j] = (struct ggml_compute_state) { .thrd = 0, @@ -9078,10 +9121,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) .node = NULL, .shared = &state_shared, }; - - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); } } @@ -9319,15 +9358,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // COMPUTE if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - // launch thread pool for (int j = 0; j < n_threads - 1; j++) { workers[j].params = (struct ggml_compute_params) { @@ -9338,16 +9368,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) .wdata = cgraph->work ? cgraph->work->data : NULL, }; workers[j].node = node; + thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]); } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); } params.type = GGML_TASK_COMPUTE; @@ -9355,34 +9377,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // wait for thread pool if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } + thpool_wait(ctx->tpool); } // FINALIZE if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - // launch thread pool for (int j = 0; j < n_threads - 1; j++) { workers[j].params = (struct ggml_compute_params) { @@ -9393,16 +9392,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) .wdata = cgraph->work ? cgraph->work->data : NULL, }; workers[j].node = node; + thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]); } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); } params.type = GGML_TASK_FINALIZE; @@ -9410,21 +9401,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // wait for thread pool if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } + thpool_wait(ctx->tpool); } // performance stats (node) @@ -9440,16 +9417,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // join thread pool if (n_threads > 1) { - atomic_store(&state_shared.stop, true); - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - ggml_lock_destroy(&state_shared.spin); + thpool_destroy(ctx->tpool); } // performance stats (graph) diff --git a/thpool.c b/thpool.c new file mode 100644 index 000000000..59e2e5556 --- /dev/null +++ b/thpool.c @@ -0,0 +1,553 @@ +/* ******************************** + * Author: Johan Hanssen Seferidis + * License: MIT + * Description: Library providing a threading pool where you can add + * work. For usage, check the thpool.h file or README.md + * + *//** @file thpool.h *//* + * + ********************************/ + +#if defined(__APPLE__) +#include +#else +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif +#endif +#include +#include +#include +#include +#include +#include +#include +#if defined(__linux__) +#include +#endif + +#include "thpool.h" + +#ifdef THPOOL_DEBUG +#define THPOOL_DEBUG 1 +#else +#define THPOOL_DEBUG 0 +#endif + +#if !defined(DISABLE_PRINT) || defined(THPOOL_DEBUG) +#define err(str) fprintf(stderr, str) +#else +#define err(str) +#endif + +static volatile int threads_keepalive; +static volatile int threads_on_hold; + + + +/* ========================== STRUCTURES ============================ */ + + +/* Binary semaphore */ +typedef struct bsem { + pthread_mutex_t mutex; + pthread_cond_t cond; + int v; +} bsem; + + +/* Job */ +typedef struct job{ + struct job* prev; /* pointer to previous job */ + void (*function)(void* arg); /* function pointer */ + void* arg; /* function's argument */ +} job; + + +/* Job queue */ +typedef struct jobqueue{ + pthread_mutex_t rwmutex; /* used for queue r/w access */ + job *front; /* pointer to front of queue */ + job *rear; /* pointer to rear of queue */ + bsem *has_jobs; /* flag as binary semaphore */ + int len; /* number of jobs in queue */ +} jobqueue; + + +/* Thread */ +typedef struct thread{ + int id; /* friendly id */ + pthread_t pthread; /* pointer to actual thread */ + struct thpool_* thpool_p; /* access to thpool */ +} thread; + + +/* Threadpool */ +typedef struct thpool_{ + thread** threads; /* pointer to threads */ + volatile int num_threads_alive; /* threads currently alive */ + volatile int num_threads_working; /* threads currently working */ + pthread_mutex_t thcount_lock; /* used for thread count etc */ + pthread_cond_t threads_all_idle; /* signal to thpool_wait */ + jobqueue jobqueue; /* job queue */ +} thpool_; + + + + + +/* ========================== PROTOTYPES ============================ */ + + +static int thread_init(thpool_* thpool_p, struct thread** thread_p, int id); +static void* thread_do(struct thread* thread_p); +static void thread_hold(int sig_id); +static void thread_destroy(struct thread* thread_p); + +static int jobqueue_init(jobqueue* jobqueue_p); +static void jobqueue_clear(jobqueue* jobqueue_p); +static void jobqueue_push(jobqueue* jobqueue_p, struct job* newjob_p); +static struct job* jobqueue_pull(jobqueue* jobqueue_p); +static void jobqueue_destroy(jobqueue* jobqueue_p); + +static void bsem_init(struct bsem *bsem_p, int value); +static void bsem_reset(struct bsem *bsem_p); +static void bsem_post(struct bsem *bsem_p); +static void bsem_post_all(struct bsem *bsem_p); +static void bsem_wait(struct bsem *bsem_p); + + + + + +/* ========================== THREADPOOL ============================ */ + + +/* Initialise thread pool */ +struct thpool_* thpool_init(int num_threads){ + + threads_on_hold = 0; + threads_keepalive = 1; + + if (num_threads < 0){ + num_threads = 0; + } + + /* Make new thread pool */ + thpool_* thpool_p; + thpool_p = (struct thpool_*)malloc(sizeof(struct thpool_)); + if (thpool_p == NULL){ + err("thpool_init(): Could not allocate memory for thread pool\n"); + return NULL; + } + thpool_p->num_threads_alive = 0; + thpool_p->num_threads_working = 0; + + /* Initialise the job queue */ + if (jobqueue_init(&thpool_p->jobqueue) == -1){ + err("thpool_init(): Could not allocate memory for job queue\n"); + free(thpool_p); + return NULL; + } + + /* Make threads in pool */ + thpool_p->threads = (struct thread**)malloc(num_threads * sizeof(struct thread *)); + if (thpool_p->threads == NULL){ + err("thpool_init(): Could not allocate memory for threads\n"); + jobqueue_destroy(&thpool_p->jobqueue); + free(thpool_p); + return NULL; + } + + pthread_mutex_init(&(thpool_p->thcount_lock), NULL); + pthread_cond_init(&thpool_p->threads_all_idle, NULL); + + /* Thread init */ + int n; + for (n=0; nthreads[n], n); +#if THPOOL_DEBUG + printf("THPOOL_DEBUG: Created thread %d in pool \n", n); +#endif + } + + /* Wait for threads to initialize */ + while (thpool_p->num_threads_alive != num_threads) {} + + return thpool_p; +} + + +/* Add work to the thread pool */ +int thpool_add_work(thpool_* thpool_p, void (*function_p)(void*), void* arg_p){ + job* newjob; + + newjob=(struct job*)malloc(sizeof(struct job)); + if (newjob==NULL){ + err("thpool_add_work(): Could not allocate memory for new job\n"); + return -1; + } + + /* add function and argument */ + newjob->function=function_p; + newjob->arg=arg_p; + + /* add job to queue */ + jobqueue_push(&thpool_p->jobqueue, newjob); + + return 0; +} + + +/* Wait until all jobs have finished */ +void thpool_wait(thpool_* thpool_p){ + pthread_mutex_lock(&thpool_p->thcount_lock); + while (thpool_p->jobqueue.len || thpool_p->num_threads_working) { + pthread_cond_wait(&thpool_p->threads_all_idle, &thpool_p->thcount_lock); + } + pthread_mutex_unlock(&thpool_p->thcount_lock); +} + + +/* Destroy the threadpool */ +void thpool_destroy(thpool_* thpool_p){ + /* No need to destroy if it's NULL */ + if (thpool_p == NULL) return ; + + volatile int threads_total = thpool_p->num_threads_alive; + + /* End each thread 's infinite loop */ + threads_keepalive = 0; + + /* Give one second to kill idle threads */ + double TIMEOUT = 1.0; + time_t start, end; + double tpassed = 0.0; + time (&start); + while (tpassed < TIMEOUT && thpool_p->num_threads_alive){ + bsem_post_all(thpool_p->jobqueue.has_jobs); + time (&end); + tpassed = difftime(end,start); + } + + /* Poll remaining threads */ + while (thpool_p->num_threads_alive){ + bsem_post_all(thpool_p->jobqueue.has_jobs); + sleep(1); + } + + /* Job queue cleanup */ + jobqueue_destroy(&thpool_p->jobqueue); + /* Deallocs */ + int n; + for (n=0; n < threads_total; n++){ + thread_destroy(thpool_p->threads[n]); + } + free(thpool_p->threads); + free(thpool_p); +} + + +/* Pause all threads in threadpool */ +void thpool_pause(thpool_* thpool_p) { + int n; + for (n=0; n < thpool_p->num_threads_alive; n++){ + pthread_kill(thpool_p->threads[n]->pthread, SIGUSR1); + } +} + + +/* Resume all threads in threadpool */ +void thpool_resume(thpool_* thpool_p) { + // resuming a single threadpool hasn't been + // implemented yet, meanwhile this suppresses + // the warnings + (void)thpool_p; + + threads_on_hold = 0; +} + + +int thpool_num_threads_working(thpool_* thpool_p){ + return thpool_p->num_threads_working; +} + + + + + +/* ============================ THREAD ============================== */ + + +/* Initialize a thread in the thread pool + * + * @param thread address to the pointer of the thread to be created + * @param id id to be given to the thread + * @return 0 on success, -1 otherwise. + */ +static int thread_init (thpool_* thpool_p, struct thread** thread_p, int id){ + + *thread_p = (struct thread*)malloc(sizeof(struct thread)); + if (*thread_p == NULL){ + err("thread_init(): Could not allocate memory for thread\n"); + return -1; + } + + (*thread_p)->thpool_p = thpool_p; + (*thread_p)->id = id; + + pthread_create(&(*thread_p)->pthread, NULL, (void * (*)(void *)) thread_do, (*thread_p)); + pthread_detach((*thread_p)->pthread); + return 0; +} + + +/* Sets the calling thread on hold */ +static void thread_hold(int sig_id) { + (void)sig_id; + threads_on_hold = 1; + while (threads_on_hold){ + sleep(1); + } +} + + +/* What each thread is doing +* +* In principle this is an endless loop. The only time this loop gets interuppted is once +* thpool_destroy() is invoked or the program exits. +* +* @param thread thread that will run this function +* @return nothing +*/ +static void* thread_do(struct thread* thread_p){ + + /* Set thread name for profiling and debugging */ + char thread_name[16] = {0}; + snprintf(thread_name, 16, "thpool-%d", thread_p->id); + +#if defined(__linux__) + /* Use prctl instead to prevent using _GNU_SOURCE flag and implicit declaration */ + prctl(PR_SET_NAME, thread_name); +#elif defined(__APPLE__) && defined(__MACH__) + pthread_setname_np(thread_name); +#else + err("thread_do(): pthread_setname_np is not supported on this system"); +#endif + + /* Assure all threads have been created before starting serving */ + thpool_* thpool_p = thread_p->thpool_p; + + /* Register signal handler */ + struct sigaction act; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + act.sa_handler = thread_hold; + if (sigaction(SIGUSR1, &act, NULL) == -1) { + err("thread_do(): cannot handle SIGUSR1"); + } + + /* Mark thread as alive (initialized) */ + pthread_mutex_lock(&thpool_p->thcount_lock); + thpool_p->num_threads_alive += 1; + pthread_mutex_unlock(&thpool_p->thcount_lock); + + while(threads_keepalive){ + + bsem_wait(thpool_p->jobqueue.has_jobs); + + if (threads_keepalive){ + + pthread_mutex_lock(&thpool_p->thcount_lock); + thpool_p->num_threads_working++; + pthread_mutex_unlock(&thpool_p->thcount_lock); + + /* Read job from queue and execute it */ + void (*func_buff)(void*); + void* arg_buff; + job* job_p = jobqueue_pull(&thpool_p->jobqueue); + if (job_p) { + func_buff = job_p->function; + arg_buff = job_p->arg; + func_buff(arg_buff); + free(job_p); + } + + pthread_mutex_lock(&thpool_p->thcount_lock); + thpool_p->num_threads_working--; + if (!thpool_p->num_threads_working) { + pthread_cond_signal(&thpool_p->threads_all_idle); + } + pthread_mutex_unlock(&thpool_p->thcount_lock); + + } + } + pthread_mutex_lock(&thpool_p->thcount_lock); + thpool_p->num_threads_alive --; + pthread_mutex_unlock(&thpool_p->thcount_lock); + + return NULL; +} + + +/* Frees a thread */ +static void thread_destroy (thread* thread_p){ + free(thread_p); +} + + + + + +/* ============================ JOB QUEUE =========================== */ + + +/* Initialize queue */ +static int jobqueue_init(jobqueue* jobqueue_p){ + jobqueue_p->len = 0; + jobqueue_p->front = NULL; + jobqueue_p->rear = NULL; + + jobqueue_p->has_jobs = (struct bsem*)malloc(sizeof(struct bsem)); + if (jobqueue_p->has_jobs == NULL){ + return -1; + } + + pthread_mutex_init(&(jobqueue_p->rwmutex), NULL); + bsem_init(jobqueue_p->has_jobs, 0); + + return 0; +} + + +/* Clear the queue */ +static void jobqueue_clear(jobqueue* jobqueue_p){ + + while(jobqueue_p->len){ + free(jobqueue_pull(jobqueue_p)); + } + + jobqueue_p->front = NULL; + jobqueue_p->rear = NULL; + bsem_reset(jobqueue_p->has_jobs); + jobqueue_p->len = 0; + +} + + +/* Add (allocated) job to queue + */ +static void jobqueue_push(jobqueue* jobqueue_p, struct job* newjob){ + + pthread_mutex_lock(&jobqueue_p->rwmutex); + newjob->prev = NULL; + + switch(jobqueue_p->len){ + + case 0: /* if no jobs in queue */ + jobqueue_p->front = newjob; + jobqueue_p->rear = newjob; + break; + + default: /* if jobs in queue */ + jobqueue_p->rear->prev = newjob; + jobqueue_p->rear = newjob; + + } + jobqueue_p->len++; + + bsem_post(jobqueue_p->has_jobs); + pthread_mutex_unlock(&jobqueue_p->rwmutex); +} + + +/* Get first job from queue(removes it from queue) + * Notice: Caller MUST hold a mutex + */ +static struct job* jobqueue_pull(jobqueue* jobqueue_p){ + + pthread_mutex_lock(&jobqueue_p->rwmutex); + job* job_p = jobqueue_p->front; + + switch(jobqueue_p->len){ + + case 0: /* if no jobs in queue */ + break; + + case 1: /* if one job in queue */ + jobqueue_p->front = NULL; + jobqueue_p->rear = NULL; + jobqueue_p->len = 0; + break; + + default: /* if >1 jobs in queue */ + jobqueue_p->front = job_p->prev; + jobqueue_p->len--; + /* more than one job in queue -> post it */ + bsem_post(jobqueue_p->has_jobs); + + } + + pthread_mutex_unlock(&jobqueue_p->rwmutex); + return job_p; +} + + +/* Free all queue resources back to the system */ +static void jobqueue_destroy(jobqueue* jobqueue_p){ + jobqueue_clear(jobqueue_p); + free(jobqueue_p->has_jobs); +} + + + + + +/* ======================== SYNCHRONISATION ========================= */ + + +/* Init semaphore to 1 or 0 */ +static void bsem_init(bsem *bsem_p, int value) { + if (value < 0 || value > 1) { + err("bsem_init(): Binary semaphore can take only values 1 or 0"); + exit(1); + } + pthread_mutex_init(&(bsem_p->mutex), NULL); + pthread_cond_init(&(bsem_p->cond), NULL); + bsem_p->v = value; +} + + +/* Reset semaphore to 0 */ +static void bsem_reset(bsem *bsem_p) { + bsem_init(bsem_p, 0); +} + + +/* Post to at least one thread */ +static void bsem_post(bsem *bsem_p) { + pthread_mutex_lock(&bsem_p->mutex); + bsem_p->v = 1; + pthread_cond_signal(&bsem_p->cond); + pthread_mutex_unlock(&bsem_p->mutex); +} + + +/* Post to all threads */ +static void bsem_post_all(bsem *bsem_p) { + pthread_mutex_lock(&bsem_p->mutex); + bsem_p->v = 1; + pthread_cond_broadcast(&bsem_p->cond); + pthread_mutex_unlock(&bsem_p->mutex); +} + + +/* Wait on semaphore until semaphore has value 0 */ +static void bsem_wait(bsem* bsem_p) { + pthread_mutex_lock(&bsem_p->mutex); + while (bsem_p->v != 1) { + pthread_cond_wait(&bsem_p->cond, &bsem_p->mutex); + } + bsem_p->v = 0; + pthread_mutex_unlock(&bsem_p->mutex); +} diff --git a/thpool.h b/thpool.h new file mode 100644 index 000000000..af3e68d16 --- /dev/null +++ b/thpool.h @@ -0,0 +1,187 @@ +/********************************** + * @author Johan Hanssen Seferidis + * License: MIT + * + **********************************/ + +#ifndef _THPOOL_ +#define _THPOOL_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* =================================== API ======================================= */ + + +typedef struct thpool_* threadpool; + + +/** + * @brief Initialize threadpool + * + * Initializes a threadpool. This function will not return until all + * threads have initialized successfully. + * + * @example + * + * .. + * threadpool thpool; //First we declare a threadpool + * thpool = thpool_init(4); //then we initialize it to 4 threads + * .. + * + * @param num_threads number of threads to be created in the threadpool + * @return threadpool created threadpool on success, + * NULL on error + */ +threadpool thpool_init(int num_threads); + + +/** + * @brief Add work to the job queue + * + * Takes an action and its argument and adds it to the threadpool's job queue. + * If you want to add to work a function with more than one arguments then + * a way to implement this is by passing a pointer to a structure. + * + * NOTICE: You have to cast both the function and argument to not get warnings. + * + * @example + * + * void print_num(int num){ + * printf("%d\n", num); + * } + * + * int main() { + * .. + * int a = 10; + * thpool_add_work(thpool, (void*)print_num, (void*)a); + * .. + * } + * + * @param threadpool threadpool to which the work will be added + * @param function_p pointer to function to add as work + * @param arg_p pointer to an argument + * @return 0 on success, -1 otherwise. + */ +int thpool_add_work(threadpool, void (*function_p)(void*), void* arg_p); + + +/** + * @brief Wait for all queued jobs to finish + * + * Will wait for all jobs - both queued and currently running to finish. + * Once the queue is empty and all work has completed, the calling thread + * (probably the main program) will continue. + * + * Smart polling is used in wait. The polling is initially 0 - meaning that + * there is virtually no polling at all. If after 1 seconds the threads + * haven't finished, the polling interval starts growing exponentially + * until it reaches max_secs seconds. Then it jumps down to a maximum polling + * interval assuming that heavy processing is being used in the threadpool. + * + * @example + * + * .. + * threadpool thpool = thpool_init(4); + * .. + * // Add a bunch of work + * .. + * thpool_wait(thpool); + * puts("All added work has finished"); + * .. + * + * @param threadpool the threadpool to wait for + * @return nothing + */ +void thpool_wait(threadpool); + + +/** + * @brief Pauses all threads immediately + * + * The threads will be paused no matter if they are idle or working. + * The threads return to their previous states once thpool_resume + * is called. + * + * While the thread is being paused, new work can be added. + * + * @example + * + * threadpool thpool = thpool_init(4); + * thpool_pause(thpool); + * .. + * // Add a bunch of work + * .. + * thpool_resume(thpool); // Let the threads start their magic + * + * @param threadpool the threadpool where the threads should be paused + * @return nothing + */ +void thpool_pause(threadpool); + + +/** + * @brief Unpauses all threads if they are paused + * + * @example + * .. + * thpool_pause(thpool); + * sleep(10); // Delay execution 10 seconds + * thpool_resume(thpool); + * .. + * + * @param threadpool the threadpool where the threads should be unpaused + * @return nothing + */ +void thpool_resume(threadpool); + + +/** + * @brief Destroy the threadpool + * + * This will wait for the currently active threads to finish and then 'kill' + * the whole threadpool to free up memory. + * + * @example + * int main() { + * threadpool thpool1 = thpool_init(2); + * threadpool thpool2 = thpool_init(2); + * .. + * thpool_destroy(thpool1); + * .. + * return 0; + * } + * + * @param threadpool the threadpool to destroy + * @return nothing + */ +void thpool_destroy(threadpool); + + +/** + * @brief Show currently working threads + * + * Working threads are the threads that are performing work (not idle). + * + * @example + * int main() { + * threadpool thpool1 = thpool_init(2); + * threadpool thpool2 = thpool_init(2); + * .. + * printf("Working threads: %d\n", thpool_num_threads_working(thpool1)); + * .. + * return 0; + * } + * + * @param threadpool the threadpool of interest + * @return integer number of threads working + */ +int thpool_num_threads_working(threadpool); + + +#ifdef __cplusplus +} +#endif + +#endif From d3bc4df97db470d4e719bab387bb546bc5dabc70 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sat, 1 Apr 2023 19:45:33 +0200 Subject: [PATCH 3/3] fix windows build --- ggml.c | 89 ----------------------------------------- thpool.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 105 insertions(+), 102 deletions(-) diff --git a/ggml.c b/ggml.c index 58e51159c..1c3e28d37 100644 --- a/ggml.c +++ b/ggml.c @@ -52,81 +52,11 @@ static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void* unused) { - return (int) WaitForSingleObject(thread, INFINITE); -} - static int sched_yield (void) { Sleep (0); return 0; } -typedef struct pthread_mutex_tag { - CRITICAL_SECTION critical_section; -} pthread_mutex_t; - -typedef struct pthread_mutexattr_tag { - int attr; -} pthread_mutexattr_t; - -int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) { - InitializeCriticalSection (&mutex->critical_section); - return 0; -} - -int pthread_mutex_destroy(pthread_mutex_t * mutex) { - DeleteCriticalSection(&mutex->critical_section); - return 0; -} - - -int pthread_mutex_lock(pthread_mutex_t * mutex) { - EnterCriticalSection(&mutex->critical_section); - return 0; -} - -int pthread_mutex_unlock(pthread_mutex_t * mutex) { - LeaveCriticalSection(&mutex->critical_section); - return 0; -} - -typedef struct pthread_cond_tag { - CONDITION_VARIABLE cond; -} pthread_cond_t; - -int pthread_cond_init(pthread_cond_t * cond, void * unused) { - InitializeConditionVariable (&cond->cond); - return 0; -} - -int pthread_cond_destroy(pthread_cond_t * cond) { - return 0; -} - -int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) { - SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE); - return 0; -} - -int pthread_cond_broadcast(pthread_cond_t * cond) { - WakeAllConditionVariable(&cond->cond); - return 0; -} - #else #include #include @@ -9042,14 +8972,10 @@ typedef int ggml_lock_t; #define GGML_LOCK_INITIALIZER 0 -typedef pthread_t ggml_thread_t; #define ggml_thread_create pthread_create #define ggml_thread_join pthread_join -typedef pthread_mutex_t ggml_mutex_t; -typedef pthread_cond_t ggml_cond_t; - #define ggml_mutex_init pthread_mutex_init #define ggml_mutex_destroy pthread_mutex_destroy #define ggml_cond_init pthread_cond_init @@ -9063,19 +8989,10 @@ typedef pthread_cond_t ggml_cond_t; #endif struct ggml_compute_state_shared { - int n_threads; - - // synchronization primitives - int n_ready; - bool has_work; - bool stop; // stop all threads - ggml_mutex_t mutex; - ggml_cond_t cond; }; struct ggml_compute_state { - ggml_thread_t thrd; struct ggml_compute_params params; struct ggml_tensor * node; @@ -9097,11 +9014,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int n_threads = cgraph->n_threads; struct ggml_compute_state_shared state_shared = { /*.n_threads =*/ n_threads, - /*.n_ready =*/ 0, - /*.has_work =*/ false, - /*.stop =*/ false, - /*.mutex =*/ {0}, - /*.cond =*/ {0}, }; struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; @@ -9110,7 +9022,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) ctx->tpool = thpool_init(n_threads); for (int j = 0; j < n_threads - 1; j++) { workers[j] = (struct ggml_compute_state) { - .thrd = 0, .params = { .type = GGML_TASK_COMPUTE, .ith = j + 1, diff --git a/thpool.c b/thpool.c index 59e2e5556..76cf5fc3f 100644 --- a/thpool.c +++ b/thpool.c @@ -15,11 +15,111 @@ #define _POSIX_C_SOURCE 200809L #endif #endif +/* +this is not part of original thpool, thats me hacking it to work on windows +*/ +#if defined _MSC_VER || defined(__MINGW32__) +#if !defined(__MINGW32__) +#include +#else +// ref: https://github.com/ggerganov/whisper.cpp/issues/168 +#include +#endif + +unsigned int sleep(unsigned int seconds) { + Sleep(seconds * 1000); + return 0; +} + +typedef HANDLE pthread_t; + +typedef DWORD thread_ret_t; +static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { + HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); + if (handle == NULL) + { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void* unused) { + return (int) WaitForSingleObject(thread, INFINITE); +} + +static int pthread_detach(pthread_t thread) { + CloseHandle(thread); + return 0; +} + +typedef struct pthread_mutex_tag { + CRITICAL_SECTION critical_section; +} pthread_mutex_t; + +typedef struct pthread_mutexattr_tag { + int attr; +} pthread_mutexattr_t; + +int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) { + InitializeCriticalSection (&mutex->critical_section); + return 0; +} + +int pthread_mutex_destroy(pthread_mutex_t * mutex) { + DeleteCriticalSection(&mutex->critical_section); + return 0; +} + + +int pthread_mutex_lock(pthread_mutex_t * mutex) { + EnterCriticalSection(&mutex->critical_section); + return 0; +} + +int pthread_mutex_unlock(pthread_mutex_t * mutex) { + LeaveCriticalSection(&mutex->critical_section); + return 0; +} + +typedef struct pthread_cond_tag { + CONDITION_VARIABLE cond; +} pthread_cond_t; + +int pthread_cond_init(pthread_cond_t * cond, void * unused) { + InitializeConditionVariable (&cond->cond); + return 0; +} + +int pthread_cond_destroy(pthread_cond_t * cond) { + return 0; +} + +int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) { + SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE); + return 0; +} + +int pthread_cond_broadcast(pthread_cond_t * cond) { + WakeAllConditionVariable(&cond->cond); + return 0; +} + +int pthread_cond_signal(pthread_cond_t * cond) { + WakeConditionVariable(&cond->cond); + return 0; +} +#else #include +#include +#endif + #include #include #include -#include + + #include #include #if defined(__linux__) @@ -247,16 +347,6 @@ void thpool_destroy(thpool_* thpool_p){ free(thpool_p); } - -/* Pause all threads in threadpool */ -void thpool_pause(thpool_* thpool_p) { - int n; - for (n=0; n < thpool_p->num_threads_alive; n++){ - pthread_kill(thpool_p->threads[n]->pthread, SIGUSR1); - } -} - - /* Resume all threads in threadpool */ void thpool_resume(thpool_* thpool_p) { // resuming a single threadpool hasn't been @@ -332,20 +422,22 @@ static void* thread_do(struct thread* thread_p){ #elif defined(__APPLE__) && defined(__MACH__) pthread_setname_np(thread_name); #else - err("thread_do(): pthread_setname_np is not supported on this system"); + // err("thread_do(): pthread_setname_np is not supported on this system"); #endif /* Assure all threads have been created before starting serving */ thpool_* thpool_p = thread_p->thpool_p; /* Register signal handler */ + /* + ///// HACK struct sigaction act; sigemptyset(&act.sa_mask); act.sa_flags = 0; act.sa_handler = thread_hold; if (sigaction(SIGUSR1, &act, NULL) == -1) { err("thread_do(): cannot handle SIGUSR1"); - } + }*/ /* Mark thread as alive (initialized) */ pthread_mutex_lock(&thpool_p->thcount_lock);