diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 88e70e495..482fc6457 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -326,7 +326,7 @@ jobs:
 #          sudo apt-get install cmake
 #
 #      - name: Configure
-#        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
+#        run: cmake . -DCMAKE_BUILD_TYPE=RelWithDebInfo -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
 #
 #      - name: Build
 #        run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a73d72c4..08b87a952 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -336,7 +336,10 @@ endif()
 
 add_library(ggml OBJECT
             ggml.c
-            ggml.h)
+            ggml.h
+            thpool.c
+            thpool.h
+            )
 
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
diff --git a/Makefile b/Makefile
index cb14ffdbc..5ea3b18b4 100644
--- a/Makefile
+++ b/Makefile
@@ -139,6 +139,9 @@ default: main quantize perplexity embedding
 # Build library
 #
 
+thpool.o: thpool.c thpool.h
+	$(CC)  $(CFLAGS) -c thpool.c -o thpool.o
+
 ggml.o: ggml.c ggml.h
 	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
 
@@ -151,20 +154,20 @@ common.o: examples/common.cpp examples/common.h
 clean:
 	rm -vf *.o main quantize perplexity embedding
 
-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp thpool.o ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp thpool.o ggml.o llama.o common.o -o main $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp thpool.o ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp thpool.o ggml.o llama.o -o quantize $(LDFLAGS)
 
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp thpool.o ggml.o llama.o common.o -o perplexity $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp thpool.o ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp thpool.o ggml.o llama.o common.o -o embedding $(LDFLAGS)
 
 #
 # Tests
diff --git a/ggml.c b/ggml.c
index 8a60bc383..d3d216c61 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3,6 +3,8 @@
 
 #include "ggml.h"
 
+#include "thpool.h"
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -51,28 +53,11 @@ static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
 
-typedef HANDLE pthread_t;
-
-typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
-    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
-    if (handle == NULL)
-    {
-        return EAGAIN;
-    }
-
-    *out = handle;
-    return 0;
-}
-
-static int pthread_join(pthread_t thread, void* unused) {
-    return (int) WaitForSingleObject(thread, INFINITE);
-}
-
 static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
+
 #else
 #include <pthread.h>
 #include <stdatomic.h>
@@ -2726,6 +2711,7 @@ struct ggml_context {
 
     struct ggml_scratch scratch;
     struct ggml_scratch scratch_save;
+    threadpool tpool;
 };
 
 struct ggml_context_container {
@@ -3010,6 +2996,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.objects_end        =*/ NULL,
         /*.scratch            =*/ { 0, 0, NULL, },
         /*.scratch_save       =*/ { 0, 0, NULL, },
+        /*.thpool             =*/ NULL,
     };
 
     GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
@@ -9214,6 +9201,19 @@ typedef pthread_t ggml_thread_t;
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
+typedef pthread_mutex_t ggml_mutex_t;
+typedef pthread_cond_t ggml_cond_t;
+
+#define ggml_mutex_init       pthread_mutex_init
+#define ggml_mutex_destroy    pthread_mutex_destroy
+#define ggml_cond_init        pthread_cond_init
+#define ggml_cond_destroy     pthread_cond_destroy
+
+#define ggml_mutex_lock       pthread_mutex_lock
+#define ggml_mutex_unlock     pthread_mutex_unlock
+#define ggml_cond_broadcast   pthread_cond_broadcast
+#define ggml_cond_wait        pthread_cond_wait
+
 #else
 
 //typedef pthread_spinlock_t ggml_lock_t;
@@ -9232,26 +9232,27 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
 
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
+#define ggml_mutex_init       pthread_mutex_init
+#define ggml_mutex_destroy    pthread_mutex_destroy
+#define ggml_cond_init        pthread_cond_init
+#define ggml_cond_destroy     pthread_cond_destroy
+
+#define ggml_mutex_lock       pthread_mutex_lock
+#define ggml_mutex_unlock     pthread_mutex_unlock
+#define ggml_cond_broadcast   pthread_cond_broadcast
+#define ggml_cond_wait        pthread_cond_wait
+
 #endif
 
 struct ggml_compute_state_shared {
-    ggml_lock_t spin;
-
     int n_threads;
-
-    // synchronization primitives
-    atomic_int  n_ready;
-    atomic_bool has_work;
-    atomic_bool stop; // stop all threads
 };
 
 struct ggml_compute_state {
-    ggml_thread_t thrd;
 
     struct ggml_compute_params params;
     struct ggml_tensor * node;
@@ -9259,75 +9260,28 @@ struct ggml_compute_state {
     struct ggml_compute_state_shared * shared;
 };
 
-static thread_ret_t ggml_graph_compute_thread(void * data) {
+static void ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
-    const int n_threads = state->shared->n_threads;
-
-    while (true) {
-        if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
-            atomic_store(&state->shared->has_work, false);
-        } else {
-            while (atomic_load(&state->shared->has_work)) {
-                if (atomic_load(&state->shared->stop)) {
-                    return 0;
-                }
-                ggml_lock_lock  (&state->shared->spin);
-                ggml_lock_unlock(&state->shared->spin);
-            }
-        }
-
-        atomic_fetch_sub(&state->shared->n_ready, 1);
-
-        // wait for work
-        while (!atomic_load(&state->shared->has_work)) {
-            if (atomic_load(&state->shared->stop)) {
-                return 0;
-            }
-            ggml_lock_lock  (&state->shared->spin);
-            ggml_lock_unlock(&state->shared->spin);
-        }
-
-        // check if we should stop
-        if (atomic_load(&state->shared->stop)) {
-            break;
-        }
-
-        if (state->node) {
-            if (state->params.ith < state->params.nth) {
-                ggml_compute_forward(&state->params, state->node);
-            }
-
-            state->node = NULL;
-        } else {
-            break;
+    if (state->node) {
+        if (state->params.ith < state->params.nth) {
+            ggml_compute_forward(&state->params, state->node);
         }
+        state->node = NULL;
     }
-
-    return 0;
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
     const int n_threads = cgraph->n_threads;
-
     struct ggml_compute_state_shared state_shared = {
-        /*.spin      =*/ GGML_LOCK_INITIALIZER,
         /*.n_threads =*/ n_threads,
-        /*.n_ready   =*/ 0,
-        /*.has_work  =*/ false,
-        /*.stop      =*/ false,
     };
     struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
 
     // create thread pool
     if (n_threads > 1) {
-        ggml_lock_init(&state_shared.spin);
-
-        atomic_store(&state_shared.has_work, true);
-
+        ctx->tpool = thpool_init(n_threads);
         for (int j = 0; j < n_threads - 1; j++) {
             workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
                 .params = {
                     .type  = GGML_TASK_COMPUTE,
                     .ith   = j + 1,
@@ -9338,10 +9292,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 .node   = NULL,
                 .shared = &state_shared,
             };
-
-            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
         }
     }
 
@@ -9579,15 +9529,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // COMPUTE
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
             // launch thread pool
             for (int j = 0; j < n_threads - 1; j++) {
                 workers[j].params = (struct ggml_compute_params) {
@@ -9598,16 +9539,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .wdata = cgraph->work ? cgraph->work->data : NULL,
                 };
                 workers[j].node = node;
+                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
             }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
         }
 
         params.type = GGML_TASK_COMPUTE;
@@ -9615,34 +9548,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // wait for thread pool
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
+            thpool_wait(ctx->tpool);
         }
 
         // FINALIZE
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
             // launch thread pool
             for (int j = 0; j < n_threads - 1; j++) {
                 workers[j].params = (struct ggml_compute_params) {
@@ -9653,16 +9563,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .wdata = cgraph->work ? cgraph->work->data : NULL,
                 };
                 workers[j].node = node;
+                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
             }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
         }
 
         params.type = GGML_TASK_FINALIZE;
@@ -9670,21 +9572,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // wait for thread pool
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
+            thpool_wait(ctx->tpool);
         }
 
         // performance stats (node)
@@ -9700,16 +9588,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
     // join thread pool
     if (n_threads > 1) {
-        atomic_store(&state_shared.stop, true);
-        atomic_store(&state_shared.has_work, true);
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-
-        ggml_lock_destroy(&state_shared.spin);
+        thpool_destroy(ctx->tpool);
     }
 
     // performance stats (graph)
diff --git a/thpool.c b/thpool.c
new file mode 100644
index 000000000..76cf5fc3f
--- /dev/null
+++ b/thpool.c
@@ -0,0 +1,645 @@
+/* ********************************
+ * Author:       Johan Hanssen Seferidis
+ * License:	     MIT
+ * Description:  Library providing a threading pool where you can add
+ *               work. For usage, check the thpool.h file or README.md
+ *
+ *//** @file thpool.h *//*
+ *
+ ********************************/
+
+#if defined(__APPLE__)
+#include <AvailabilityMacros.h>
+#else
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
+#endif
+/*
+this is not part of original thpool, thats me hacking it to work on windows
+*/
+#if defined _MSC_VER || defined(__MINGW32__)
+#if !defined(__MINGW32__)
+#include <Windows.h>
+#else
+// ref: https://github.com/ggerganov/whisper.cpp/issues/168
+#include <windows.h>
+#endif
+
+unsigned int sleep(unsigned int seconds) {
+	Sleep(seconds * 1000);
+	return 0;
+}
+
+typedef HANDLE pthread_t;
+
+typedef DWORD thread_ret_t;
+static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
+    if (handle == NULL)
+    {
+        return EAGAIN;
+    }
+
+    *out = handle;
+    return 0;
+}
+
+static int pthread_join(pthread_t thread, void* unused) {
+    return (int) WaitForSingleObject(thread, INFINITE);
+}
+
+static int pthread_detach(pthread_t thread) {
+	CloseHandle(thread);
+	return 0;
+}
+
+typedef struct pthread_mutex_tag {
+    CRITICAL_SECTION critical_section;
+} pthread_mutex_t;
+
+typedef struct pthread_mutexattr_tag {
+    int attr;
+} pthread_mutexattr_t;
+
+int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) {
+    InitializeCriticalSection (&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t * mutex) {
+    DeleteCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+
+int pthread_mutex_lock(pthread_mutex_t * mutex) {
+    EnterCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_unlock(pthread_mutex_t * mutex) {
+    LeaveCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+typedef struct pthread_cond_tag {
+    CONDITION_VARIABLE cond;
+} pthread_cond_t;
+
+int pthread_cond_init(pthread_cond_t * cond, void * unused) {
+    InitializeConditionVariable (&cond->cond);
+    return 0;
+}
+
+int pthread_cond_destroy(pthread_cond_t * cond) {
+    return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) {
+    SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE);
+    return 0;
+}
+
+int pthread_cond_broadcast(pthread_cond_t * cond) {
+    WakeAllConditionVariable(&cond->cond);
+    return 0;
+}
+
+int pthread_cond_signal(pthread_cond_t * cond) {
+    WakeConditionVariable(&cond->cond);
+    return 0;
+}
+#else
+#include <unistd.h>
+#include <pthread.h>
+#endif
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#include <errno.h>
+#include <time.h>
+#if defined(__linux__)
+#include <sys/prctl.h>
+#endif
+
+#include "thpool.h"
+
+#ifdef THPOOL_DEBUG
+#define THPOOL_DEBUG 1
+#else
+#define THPOOL_DEBUG 0
+#endif
+
+#if !defined(DISABLE_PRINT) || defined(THPOOL_DEBUG)
+#define err(str) fprintf(stderr, str)
+#else
+#define err(str)
+#endif
+
+static volatile int threads_keepalive;
+static volatile int threads_on_hold;
+
+
+
+/* ========================== STRUCTURES ============================ */
+
+
+/* Binary semaphore */
+typedef struct bsem {
+	pthread_mutex_t mutex;
+	pthread_cond_t   cond;
+	int v;
+} bsem;
+
+
+/* Job */
+typedef struct job{
+	struct job*  prev;                   /* pointer to previous job   */
+	void   (*function)(void* arg);       /* function pointer          */
+	void*  arg;                          /* function's argument       */
+} job;
+
+
+/* Job queue */
+typedef struct jobqueue{
+	pthread_mutex_t rwmutex;             /* used for queue r/w access */
+	job  *front;                         /* pointer to front of queue */
+	job  *rear;                          /* pointer to rear  of queue */
+	bsem *has_jobs;                      /* flag as binary semaphore  */
+	int   len;                           /* number of jobs in queue   */
+} jobqueue;
+
+
+/* Thread */
+typedef struct thread{
+	int       id;                        /* friendly id               */
+	pthread_t pthread;                   /* pointer to actual thread  */
+	struct thpool_* thpool_p;            /* access to thpool          */
+} thread;
+
+
+/* Threadpool */
+typedef struct thpool_{
+	thread**   threads;                  /* pointer to threads        */
+	volatile int num_threads_alive;      /* threads currently alive   */
+	volatile int num_threads_working;    /* threads currently working */
+	pthread_mutex_t  thcount_lock;       /* used for thread count etc */
+	pthread_cond_t  threads_all_idle;    /* signal to thpool_wait     */
+	jobqueue  jobqueue;                  /* job queue                 */
+} thpool_;
+
+
+
+
+
+/* ========================== PROTOTYPES ============================ */
+
+
+static int  thread_init(thpool_* thpool_p, struct thread** thread_p, int id);
+static void* thread_do(struct thread* thread_p);
+static void  thread_hold(int sig_id);
+static void  thread_destroy(struct thread* thread_p);
+
+static int   jobqueue_init(jobqueue* jobqueue_p);
+static void  jobqueue_clear(jobqueue* jobqueue_p);
+static void  jobqueue_push(jobqueue* jobqueue_p, struct job* newjob_p);
+static struct job* jobqueue_pull(jobqueue* jobqueue_p);
+static void  jobqueue_destroy(jobqueue* jobqueue_p);
+
+static void  bsem_init(struct bsem *bsem_p, int value);
+static void  bsem_reset(struct bsem *bsem_p);
+static void  bsem_post(struct bsem *bsem_p);
+static void  bsem_post_all(struct bsem *bsem_p);
+static void  bsem_wait(struct bsem *bsem_p);
+
+
+
+
+
+/* ========================== THREADPOOL ============================ */
+
+
+/* Initialise thread pool */
+struct thpool_* thpool_init(int num_threads){
+
+	threads_on_hold   = 0;
+	threads_keepalive = 1;
+
+	if (num_threads < 0){
+		num_threads = 0;
+	}
+
+	/* Make new thread pool */
+	thpool_* thpool_p;
+	thpool_p = (struct thpool_*)malloc(sizeof(struct thpool_));
+	if (thpool_p == NULL){
+		err("thpool_init(): Could not allocate memory for thread pool\n");
+		return NULL;
+	}
+	thpool_p->num_threads_alive   = 0;
+	thpool_p->num_threads_working = 0;
+
+	/* Initialise the job queue */
+	if (jobqueue_init(&thpool_p->jobqueue) == -1){
+		err("thpool_init(): Could not allocate memory for job queue\n");
+		free(thpool_p);
+		return NULL;
+	}
+
+	/* Make threads in pool */
+	thpool_p->threads = (struct thread**)malloc(num_threads * sizeof(struct thread *));
+	if (thpool_p->threads == NULL){
+		err("thpool_init(): Could not allocate memory for threads\n");
+		jobqueue_destroy(&thpool_p->jobqueue);
+		free(thpool_p);
+		return NULL;
+	}
+
+	pthread_mutex_init(&(thpool_p->thcount_lock), NULL);
+	pthread_cond_init(&thpool_p->threads_all_idle, NULL);
+
+	/* Thread init */
+	int n;
+	for (n=0; n<num_threads; n++){
+		thread_init(thpool_p, &thpool_p->threads[n], n);
+#if THPOOL_DEBUG
+			printf("THPOOL_DEBUG: Created thread %d in pool \n", n);
+#endif
+	}
+
+	/* Wait for threads to initialize */
+	while (thpool_p->num_threads_alive != num_threads) {}
+
+	return thpool_p;
+}
+
+
+/* Add work to the thread pool */
+int thpool_add_work(thpool_* thpool_p, void (*function_p)(void*), void* arg_p){
+	job* newjob;
+
+	newjob=(struct job*)malloc(sizeof(struct job));
+	if (newjob==NULL){
+		err("thpool_add_work(): Could not allocate memory for new job\n");
+		return -1;
+	}
+
+	/* add function and argument */
+	newjob->function=function_p;
+	newjob->arg=arg_p;
+
+	/* add job to queue */
+	jobqueue_push(&thpool_p->jobqueue, newjob);
+
+	return 0;
+}
+
+
+/* Wait until all jobs have finished */
+void thpool_wait(thpool_* thpool_p){
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	while (thpool_p->jobqueue.len || thpool_p->num_threads_working) {
+		pthread_cond_wait(&thpool_p->threads_all_idle, &thpool_p->thcount_lock);
+	}
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+}
+
+
+/* Destroy the threadpool */
+void thpool_destroy(thpool_* thpool_p){
+	/* No need to destroy if it's NULL */
+	if (thpool_p == NULL) return ;
+
+	volatile int threads_total = thpool_p->num_threads_alive;
+
+	/* End each thread 's infinite loop */
+	threads_keepalive = 0;
+
+	/* Give one second to kill idle threads */
+	double TIMEOUT = 1.0;
+	time_t start, end;
+	double tpassed = 0.0;
+	time (&start);
+	while (tpassed < TIMEOUT && thpool_p->num_threads_alive){
+		bsem_post_all(thpool_p->jobqueue.has_jobs);
+		time (&end);
+		tpassed = difftime(end,start);
+	}
+
+	/* Poll remaining threads */
+	while (thpool_p->num_threads_alive){
+		bsem_post_all(thpool_p->jobqueue.has_jobs);
+		sleep(1);
+	}
+
+	/* Job queue cleanup */
+	jobqueue_destroy(&thpool_p->jobqueue);
+	/* Deallocs */
+	int n;
+	for (n=0; n < threads_total; n++){
+		thread_destroy(thpool_p->threads[n]);
+	}
+	free(thpool_p->threads);
+	free(thpool_p);
+}
+
+/* Resume all threads in threadpool */
+void thpool_resume(thpool_* thpool_p) {
+    // resuming a single threadpool hasn't been
+    // implemented yet, meanwhile this suppresses
+    // the warnings
+    (void)thpool_p;
+
+	threads_on_hold = 0;
+}
+
+
+int thpool_num_threads_working(thpool_* thpool_p){
+	return thpool_p->num_threads_working;
+}
+
+
+
+
+
+/* ============================ THREAD ============================== */
+
+
+/* Initialize a thread in the thread pool
+ *
+ * @param thread        address to the pointer of the thread to be created
+ * @param id            id to be given to the thread
+ * @return 0 on success, -1 otherwise.
+ */
+static int thread_init (thpool_* thpool_p, struct thread** thread_p, int id){
+
+	*thread_p = (struct thread*)malloc(sizeof(struct thread));
+	if (*thread_p == NULL){
+		err("thread_init(): Could not allocate memory for thread\n");
+		return -1;
+	}
+
+	(*thread_p)->thpool_p = thpool_p;
+	(*thread_p)->id       = id;
+
+	pthread_create(&(*thread_p)->pthread, NULL, (void * (*)(void *)) thread_do, (*thread_p));
+	pthread_detach((*thread_p)->pthread);
+	return 0;
+}
+
+
+/* Sets the calling thread on hold */
+static void thread_hold(int sig_id) {
+    (void)sig_id;
+	threads_on_hold = 1;
+	while (threads_on_hold){
+		sleep(1);
+	}
+}
+
+
+/* What each thread is doing
+*
+* In principle this is an endless loop. The only time this loop gets interuppted is once
+* thpool_destroy() is invoked or the program exits.
+*
+* @param  thread        thread that will run this function
+* @return nothing
+*/
+static void* thread_do(struct thread* thread_p){
+
+	/* Set thread name for profiling and debugging */
+	char thread_name[16] = {0};
+	snprintf(thread_name, 16, "thpool-%d", thread_p->id);
+
+#if defined(__linux__)
+	/* Use prctl instead to prevent using _GNU_SOURCE flag and implicit declaration */
+	prctl(PR_SET_NAME, thread_name);
+#elif defined(__APPLE__) && defined(__MACH__)
+	pthread_setname_np(thread_name);
+#else
+	// err("thread_do(): pthread_setname_np is not supported on this system");
+#endif
+
+	/* Assure all threads have been created before starting serving */
+	thpool_* thpool_p = thread_p->thpool_p;
+
+	/* Register signal handler */
+	/*
+	///// HACK
+	struct sigaction act;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = thread_hold;
+	if (sigaction(SIGUSR1, &act, NULL) == -1) {
+		err("thread_do(): cannot handle SIGUSR1");
+	}*/
+
+	/* Mark thread as alive (initialized) */
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	thpool_p->num_threads_alive += 1;
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+	while(threads_keepalive){
+
+		bsem_wait(thpool_p->jobqueue.has_jobs);
+
+		if (threads_keepalive){
+
+			pthread_mutex_lock(&thpool_p->thcount_lock);
+			thpool_p->num_threads_working++;
+			pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+			/* Read job from queue and execute it */
+			void (*func_buff)(void*);
+			void*  arg_buff;
+			job* job_p = jobqueue_pull(&thpool_p->jobqueue);
+			if (job_p) {
+				func_buff = job_p->function;
+				arg_buff  = job_p->arg;
+				func_buff(arg_buff);
+				free(job_p);
+			}
+
+			pthread_mutex_lock(&thpool_p->thcount_lock);
+			thpool_p->num_threads_working--;
+			if (!thpool_p->num_threads_working) {
+				pthread_cond_signal(&thpool_p->threads_all_idle);
+			}
+			pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+		}
+	}
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	thpool_p->num_threads_alive --;
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+	return NULL;
+}
+
+
+/* Frees a thread  */
+static void thread_destroy (thread* thread_p){
+	free(thread_p);
+}
+
+
+
+
+
+/* ============================ JOB QUEUE =========================== */
+
+
+/* Initialize queue */
+static int jobqueue_init(jobqueue* jobqueue_p){
+	jobqueue_p->len = 0;
+	jobqueue_p->front = NULL;
+	jobqueue_p->rear  = NULL;
+
+	jobqueue_p->has_jobs = (struct bsem*)malloc(sizeof(struct bsem));
+	if (jobqueue_p->has_jobs == NULL){
+		return -1;
+	}
+
+	pthread_mutex_init(&(jobqueue_p->rwmutex), NULL);
+	bsem_init(jobqueue_p->has_jobs, 0);
+
+	return 0;
+}
+
+
+/* Clear the queue */
+static void jobqueue_clear(jobqueue* jobqueue_p){
+
+	while(jobqueue_p->len){
+		free(jobqueue_pull(jobqueue_p));
+	}
+
+	jobqueue_p->front = NULL;
+	jobqueue_p->rear  = NULL;
+	bsem_reset(jobqueue_p->has_jobs);
+	jobqueue_p->len = 0;
+
+}
+
+
+/* Add (allocated) job to queue
+ */
+static void jobqueue_push(jobqueue* jobqueue_p, struct job* newjob){
+
+	pthread_mutex_lock(&jobqueue_p->rwmutex);
+	newjob->prev = NULL;
+
+	switch(jobqueue_p->len){
+
+		case 0:  /* if no jobs in queue */
+					jobqueue_p->front = newjob;
+					jobqueue_p->rear  = newjob;
+					break;
+
+		default: /* if jobs in queue */
+					jobqueue_p->rear->prev = newjob;
+					jobqueue_p->rear = newjob;
+
+	}
+	jobqueue_p->len++;
+
+	bsem_post(jobqueue_p->has_jobs);
+	pthread_mutex_unlock(&jobqueue_p->rwmutex);
+}
+
+
+/* Get first job from queue(removes it from queue)
+ * Notice: Caller MUST hold a mutex
+ */
+static struct job* jobqueue_pull(jobqueue* jobqueue_p){
+
+	pthread_mutex_lock(&jobqueue_p->rwmutex);
+	job* job_p = jobqueue_p->front;
+
+	switch(jobqueue_p->len){
+
+		case 0:  /* if no jobs in queue */
+		  			break;
+
+		case 1:  /* if one job in queue */
+					jobqueue_p->front = NULL;
+					jobqueue_p->rear  = NULL;
+					jobqueue_p->len = 0;
+					break;
+
+		default: /* if >1 jobs in queue */
+					jobqueue_p->front = job_p->prev;
+					jobqueue_p->len--;
+					/* more than one job in queue -> post it */
+					bsem_post(jobqueue_p->has_jobs);
+
+	}
+
+	pthread_mutex_unlock(&jobqueue_p->rwmutex);
+	return job_p;
+}
+
+
+/* Free all queue resources back to the system */
+static void jobqueue_destroy(jobqueue* jobqueue_p){
+	jobqueue_clear(jobqueue_p);
+	free(jobqueue_p->has_jobs);
+}
+
+
+
+
+
+/* ======================== SYNCHRONISATION ========================= */
+
+
+/* Init semaphore to 1 or 0 */
+static void bsem_init(bsem *bsem_p, int value) {
+	if (value < 0 || value > 1) {
+		err("bsem_init(): Binary semaphore can take only values 1 or 0");
+		exit(1);
+	}
+	pthread_mutex_init(&(bsem_p->mutex), NULL);
+	pthread_cond_init(&(bsem_p->cond), NULL);
+	bsem_p->v = value;
+}
+
+
+/* Reset semaphore to 0 */
+static void bsem_reset(bsem *bsem_p) {
+	bsem_init(bsem_p, 0);
+}
+
+
+/* Post to at least one thread */
+static void bsem_post(bsem *bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	bsem_p->v = 1;
+	pthread_cond_signal(&bsem_p->cond);
+	pthread_mutex_unlock(&bsem_p->mutex);
+}
+
+
+/* Post to all threads */
+static void bsem_post_all(bsem *bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	bsem_p->v = 1;
+	pthread_cond_broadcast(&bsem_p->cond);
+	pthread_mutex_unlock(&bsem_p->mutex);
+}
+
+
+/* Wait on semaphore until semaphore has value 0 */
+static void bsem_wait(bsem* bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	while (bsem_p->v != 1) {
+		pthread_cond_wait(&bsem_p->cond, &bsem_p->mutex);
+	}
+	bsem_p->v = 0;
+	pthread_mutex_unlock(&bsem_p->mutex);
+}
diff --git a/thpool.h b/thpool.h
new file mode 100644
index 000000000..af3e68d16
--- /dev/null
+++ b/thpool.h
@@ -0,0 +1,187 @@
+/**********************************
+ * @author      Johan Hanssen Seferidis
+ * License:     MIT
+ *
+ **********************************/
+
+#ifndef _THPOOL_
+#define _THPOOL_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* =================================== API ======================================= */
+
+
+typedef struct thpool_* threadpool;
+
+
+/**
+ * @brief  Initialize threadpool
+ *
+ * Initializes a threadpool. This function will not return until all
+ * threads have initialized successfully.
+ *
+ * @example
+ *
+ *    ..
+ *    threadpool thpool;                     //First we declare a threadpool
+ *    thpool = thpool_init(4);               //then we initialize it to 4 threads
+ *    ..
+ *
+ * @param  num_threads   number of threads to be created in the threadpool
+ * @return threadpool    created threadpool on success,
+ *                       NULL on error
+ */
+threadpool thpool_init(int num_threads);
+
+
+/**
+ * @brief Add work to the job queue
+ *
+ * Takes an action and its argument and adds it to the threadpool's job queue.
+ * If you want to add to work a function with more than one arguments then
+ * a way to implement this is by passing a pointer to a structure.
+ *
+ * NOTICE: You have to cast both the function and argument to not get warnings.
+ *
+ * @example
+ *
+ *    void print_num(int num){
+ *       printf("%d\n", num);
+ *    }
+ *
+ *    int main() {
+ *       ..
+ *       int a = 10;
+ *       thpool_add_work(thpool, (void*)print_num, (void*)a);
+ *       ..
+ *    }
+ *
+ * @param  threadpool    threadpool to which the work will be added
+ * @param  function_p    pointer to function to add as work
+ * @param  arg_p         pointer to an argument
+ * @return 0 on success, -1 otherwise.
+ */
+int thpool_add_work(threadpool, void (*function_p)(void*), void* arg_p);
+
+
+/**
+ * @brief Wait for all queued jobs to finish
+ *
+ * Will wait for all jobs - both queued and currently running to finish.
+ * Once the queue is empty and all work has completed, the calling thread
+ * (probably the main program) will continue.
+ *
+ * Smart polling is used in wait. The polling is initially 0 - meaning that
+ * there is virtually no polling at all. If after 1 seconds the threads
+ * haven't finished, the polling interval starts growing exponentially
+ * until it reaches max_secs seconds. Then it jumps down to a maximum polling
+ * interval assuming that heavy processing is being used in the threadpool.
+ *
+ * @example
+ *
+ *    ..
+ *    threadpool thpool = thpool_init(4);
+ *    ..
+ *    // Add a bunch of work
+ *    ..
+ *    thpool_wait(thpool);
+ *    puts("All added work has finished");
+ *    ..
+ *
+ * @param threadpool     the threadpool to wait for
+ * @return nothing
+ */
+void thpool_wait(threadpool);
+
+
+/**
+ * @brief Pauses all threads immediately
+ *
+ * The threads will be paused no matter if they are idle or working.
+ * The threads return to their previous states once thpool_resume
+ * is called.
+ *
+ * While the thread is being paused, new work can be added.
+ *
+ * @example
+ *
+ *    threadpool thpool = thpool_init(4);
+ *    thpool_pause(thpool);
+ *    ..
+ *    // Add a bunch of work
+ *    ..
+ *    thpool_resume(thpool); // Let the threads start their magic
+ *
+ * @param threadpool    the threadpool where the threads should be paused
+ * @return nothing
+ */
+void thpool_pause(threadpool);
+
+
+/**
+ * @brief Unpauses all threads if they are paused
+ *
+ * @example
+ *    ..
+ *    thpool_pause(thpool);
+ *    sleep(10);              // Delay execution 10 seconds
+ *    thpool_resume(thpool);
+ *    ..
+ *
+ * @param threadpool     the threadpool where the threads should be unpaused
+ * @return nothing
+ */
+void thpool_resume(threadpool);
+
+
+/**
+ * @brief Destroy the threadpool
+ *
+ * This will wait for the currently active threads to finish and then 'kill'
+ * the whole threadpool to free up memory.
+ *
+ * @example
+ * int main() {
+ *    threadpool thpool1 = thpool_init(2);
+ *    threadpool thpool2 = thpool_init(2);
+ *    ..
+ *    thpool_destroy(thpool1);
+ *    ..
+ *    return 0;
+ * }
+ *
+ * @param threadpool     the threadpool to destroy
+ * @return nothing
+ */
+void thpool_destroy(threadpool);
+
+
+/**
+ * @brief Show currently working threads
+ *
+ * Working threads are the threads that are performing work (not idle).
+ *
+ * @example
+ * int main() {
+ *    threadpool thpool1 = thpool_init(2);
+ *    threadpool thpool2 = thpool_init(2);
+ *    ..
+ *    printf("Working threads: %d\n", thpool_num_threads_working(thpool1));
+ *    ..
+ *    return 0;
+ * }
+ *
+ * @param threadpool     the threadpool of interest
+ * @return integer       number of threads working
+ */
+int thpool_num_threads_working(threadpool);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif