ggml : fix cont with transposed tensors when one dimension is 1 (ggml/934)

* ggml_cont: fix issue with transposed tensors when one dimension is 1

When using multiple threads, it is not enough
to check that the tensors are contiguous for
ggml_compute_forward_dup_same_cont to work correctly.
The tensors' strides also need to match.

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>

* Add ggml_cont tests

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>

* Remove dead code

It isn't possible to reach this code because
all these functions are invoked by ggml_compute_forward_dup
if and only if src0->type != dst->type.

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>

* Make ggml_compute_forward_dup_same_cont work with contiguous tensors

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>

---------

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Salvatore Mesoraca 2024-08-28 10:23:02 +02:00 committed by Georgi Gerganov
parent fbb7fcffbc
commit efe6a83e30
2 changed files with 14 additions and 21 deletions

View File

@ -8322,8 +8322,7 @@ static void ggml_compute_forward_dup_same_cont(
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == dst->type); GGML_ASSERT(src0->type == dst->type);
const size_t nb00 = src0->nb[0]; const size_t nb0 = ggml_type_size(src0->type);
const size_t nb0 = dst->nb[0];
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads
@ -8337,8 +8336,8 @@ static void ggml_compute_forward_dup_same_cont(
if (ie0 < ie1) { if (ie0 < ie1) {
memcpy( memcpy(
((char *) dst->data + ie0*nb0), ((char *) dst->data + ie0*nb0),
((char *) src0->data + ie0*nb00), ((char *) src0->data + ie0*nb0),
(ie1 - ie0) * ggml_type_size(src0->type)); (ie1 - ie0) * nb0);
} }
} }
@ -8355,11 +8354,6 @@ static void ggml_compute_forward_dup_f16(
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows // parallelize by rows
const int nr = ne01; const int nr = ne01;
// number of rows per thread // number of rows per thread
@ -8624,11 +8618,6 @@ static void ggml_compute_forward_dup_bf16(
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows // parallelize by rows
const int nr = ne01; const int nr = ne01;
// number of rows per thread // number of rows per thread
@ -8980,11 +8969,6 @@ static void ggml_compute_forward_dup_f32(
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows // parallelize by rows
const int nr = ne01; const int nr = ne01;
// number of rows per thread // number of rows per thread
@ -9294,13 +9278,13 @@ static void ggml_compute_forward_dup_bytes(
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type); GGML_ASSERT(src0->type == dst->type);
GGML_TENSOR_UNARY_OP_LOCALS;
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, dst); ggml_compute_forward_dup_same_cont(params, dst);
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS;
const size_t type_size = ggml_type_size(src0->type); const size_t type_size = ggml_type_size(src0->type);
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads

View File

@ -2322,6 +2322,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
} }
test_cases.emplace_back(new test_cont()); test_cases.emplace_back(new test_cont());
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) { auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
for (auto op : {ggml_add, ggml_mul, ggml_div}) { for (auto op : {ggml_add, ggml_mul, ggml_div}) {