mirror of https://github.com/ggerganov/llama.cpp.git
ggml : update softmax n_task calculation (#5126)
Updated the n_tasks calculation to use the maximum number of threads possible. This improves prompt eval performance by around 5% for DOT kernels and by around 10% for MMLA kernels on AWS Graviton3.
parent 5f1925a8ce
commit 7032f4f634
ggml.c: 2 changed lines (1 addition, 1 deletion)
@@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
             } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
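For illustration, here is a minimal standalone C sketch (not part of the commit) comparing the old and new formulas from the diff above. The MIN macro is redefined locally to mirror the one in ggml.c, and the values of n_threads and nrows are assumptions standing in for the real thread count and ggml_nrows(node->src[0]):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int n_threads = 16;  // assumed thread count available to the compute plan
    const int nrows     = 512; // assumed number of rows in the softmax input tensor

    // old formula: parallelism was hard-capped at 4 tasks regardless of n_threads
    const int n_tasks_old = MIN(MIN(4, n_threads), nrows);

    // new formula: one task per thread, still bounded by the number of rows
    const int n_tasks_new = MIN(n_threads, nrows);

    printf("old: %d tasks, new: %d tasks\n", n_tasks_old, n_tasks_new); // prints "old: 4 tasks, new: 16 tasks"
    return 0;
}

Softmax is applied row-wise and rows are independent, so letting every available thread take a share of the rows removes the old hard cap of 4 tasks; on many-core machines like Graviton3 this plausibly accounts for the measured 5-10% prompt eval gain.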