mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 03:14:35 +00:00
ggml : do not sched_yield when calling BLAS (#4761)
* ggml : do not sched_yield when calling BLAS ggml-ci * ggml : fix do_yield logic ggml-ci * ggml : simplify do_yield logic ggml-ci
This commit is contained in:
parent
3681f22443
commit
c1d7cb28d3
39
ggml.c
39
ggml.c
@ -9704,10 +9704,10 @@ static void ggml_compute_forward_group_norm(
|
|||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||||
// helper function to determine if it is better to use BLAS or not
|
// helper function to determine if it is better to use BLAS or not
|
||||||
// for large matrices, BLAS is faster
|
// for large matrices, BLAS is faster
|
||||||
static bool ggml_compute_forward_mul_mat_use_blas(
|
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0 = dst->src[0];
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1 = dst->src[1];
|
||||||
struct ggml_tensor * dst) {
|
|
||||||
//const int64_t ne00 = src0->ne[0];
|
//const int64_t ne00 = src0->ne[0];
|
||||||
//const int64_t ne01 = src0->ne[1];
|
//const int64_t ne01 = src0->ne[1];
|
||||||
|
|
||||||
@ -9787,7 +9787,7 @@ static void ggml_compute_forward_mul_mat(
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
||||||
if (params->ith != 0) {
|
if (params->ith != 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -16301,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|||||||
|
|
||||||
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
|
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
|
||||||
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
|
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS)
|
|
||||||
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
|
|
||||||
n_tasks = 1; // TODO: this actually is doing nothing
|
|
||||||
// the threads are still spinning
|
|
||||||
}
|
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
|
||||||
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
|
||||||
n_tasks = 1; // TODO: this actually is doing nothing
|
|
||||||
// the threads are still spinning
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
|
|
||||||
n_tasks = 1; // TODO: this actually is doing nothing
|
|
||||||
// the threads are still spinning
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_MUL_MAT_ID:
|
case GGML_OP_MUL_MAT_ID:
|
||||||
{
|
{
|
||||||
@ -16491,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||||||
state->shared->node_n += 1;
|
state->shared->node_n += 1;
|
||||||
return (thread_ret_t) GGML_EXIT_ABORTED;
|
return (thread_ret_t) GGML_EXIT_ABORTED;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
||||||
// all other threads are finished and spinning
|
// all other threads are finished and spinning
|
||||||
// do finalize and init here so we don't have synchronize again
|
// do finalize and init here so we don't have synchronize again
|
||||||
@ -16556,14 +16539,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||||||
} else {
|
} else {
|
||||||
// wait for other threads to finish
|
// wait for other threads to finish
|
||||||
const int last = node_n;
|
const int last = node_n;
|
||||||
|
|
||||||
|
const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
||||||
// depending on the workload and the operating system.
|
// depending on the workload and the operating system.
|
||||||
// since it is not clear what is the best approach, it should potentially become user-configurable
|
// since it is not clear what is the best approach, it should potentially become user-configurable
|
||||||
// ref: https://github.com/ggerganov/ggml/issues/291
|
// ref: https://github.com/ggerganov/ggml/issues/291
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
// UPD: adding the do_yield flag seems to resolve the issue universally
|
||||||
|
if (do_yield) {
|
||||||
sched_yield();
|
sched_yield();
|
||||||
#endif
|
}
|
||||||
|
|
||||||
node_n = atomic_load(&state->shared->node_n);
|
node_n = atomic_load(&state->shared->node_n);
|
||||||
if (node_n != last) break;
|
if (node_n != last) break;
|
||||||
@ -16642,7 +16629,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
|
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
||||||
if (node->src[0]->type != GGML_TYPE_F32) {
|
if (node->src[0]->type != GGML_TYPE_F32) {
|
||||||
// here we need memory just for single 2D matrix from src0
|
// here we need memory just for single 2D matrix from src0
|
||||||
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
|
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
|
||||||
|
Loading…
Reference in New Issue
Block a user