Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2024-12-26 11:24:35 +00:00
ggml : update softmax n_task calculation (#5126)
Updated the n_tasks calculation to use the maximum number of threads possible. This improved prompt-eval performance by around 5% for DOT kernels and around 10% for MMLA kernels on AWS Graviton3.
parent 5f1925a8ce
commit 7032f4f634
ggml.c: 2 changed lines (+1 −1)
@@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
             } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
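For context: the old code capped GGML_OP_SOFT_MAX at 4 tasks no matter how many threads were available, while the new code lets the work scale up to one task per row. Below is a minimal, self-contained sketch of the effect, using hypothetical values for n_threads and the row count (in ggml.c these come from the compute plan and ggml_nrows(node->src[0])):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    /* Hypothetical runtime values, chosen only for illustration. */
    const int n_threads = 16;   /* e.g. threads on a Graviton3 instance */
    const int nrows     = 4096; /* rows of the soft-max input tensor    */

    /* Old calculation: hard-capped at 4 tasks regardless of thread count. */
    const int n_tasks_old = MIN(MIN(4, n_threads), nrows);

    /* New calculation: as many tasks as threads, bounded by the row count. */
    const int n_tasks_new = MIN(n_threads, nrows);

    printf("old n_tasks = %d, new n_tasks = %d\n", n_tasks_old, n_tasks_new);
    /* prints: old n_tasks = 4, new n_tasks = 16 */
    return 0;
}

With these example numbers the soft-max work is split across 16 threads instead of 4, which is consistent with the 5-10% prompt-eval improvement reported in the commit message.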