From fee3c1d740c0e027c81e2f2f3fb48d619857175f Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 3 Jun 2024 13:49:56 -0400 Subject: [PATCH] llama : allow doing the equivalent of SSM_CONV with SUM_ROWS and MUL * ggml : allow GGML_OP_CONCAT to work on non-contiguous tensors The implementation already supported it, and this makes Mamba's conv step slightly faster. --- ggml.c | 5 ----- llama.cpp | 20 ++++++++++++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/ggml.c b/ggml.c index 253b3fa41..1a37ff2f0 100644 --- a/ggml.c +++ b/ggml.c @@ -10992,11 +10992,6 @@ static void ggml_compute_forward_concat_f32( GGML_TENSOR_BINARY_OP_LOCALS - // TODO: support for transposed / permuted tensors - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); - const int32_t dim = ggml_get_op_params_i32(dst, 0); GGML_ASSERT(dim >= 0 && dim < 4); diff --git a/llama.cpp b/llama.cpp index d47364731..36b824d56 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8713,7 +8713,7 @@ static struct ggml_tensor * llm_build_mamba( // conv { // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} - struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_cont(ctx, ggml_transpose(ctx, x)), 0); + struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0); // copy last (d_conv - 1) columns back into the state cache struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); @@ -8734,6 +8734,8 @@ static struct ggml_tensor * llm_build_mamba( // and then you're left with the resulting x tensor. // For simultaneous sequences, all sequences need to have the same length. + // TODO: remove unused implementations +#if 0 // For some reason, im2col expects a F16 kernel, but doesn't even read from it. // TODO: make im2col accept F32 kernels to directly pass ssm_conv1d to it. 
// => { d_conv * d_inner, n_seq_tokens, n_seqs} @@ -8741,14 +8743,24 @@ static struct ggml_tensor * llm_build_mamba( ggml_new_tensor_2d(ctx, GGML_TYPE_F16, d_conv, d_inner), conv_x, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F32); + #if 0 + // TODO: CUDA, SYCL, and Vulkan don't (yet) support broadcasting the ne[3] dimension on MUL_MAT x = ggml_reshape_4d(ctx, x, d_conv, 1, d_inner, n_seq_tokens * n_seqs); // => {1, 1, d_inner, n_seq_tokens * n_seqs} x = ggml_mul_mat(ctx, ggml_reshape_3d(ctx, model.layers[il].ssm_conv1d, d_conv, 1, d_inner), x); - x = ggml_reshape_3d(ctx, x, d_inner, n_seq_tokens, n_seqs); + #else + x = ggml_reshape_4d(ctx, x, d_conv, d_inner, n_seq_tokens, n_seqs); - // Alternatively, this does the same as the above - // x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); + // NOTE: it seems this is very slightly more performant than MUL_MAT on CPU for small row sizes + // => {1, d_inner, n_seq_tokens, n_seqs} + x = ggml_sum_rows(ctx, ggml_mul(ctx, x, model.layers[il].ssm_conv1d)); + #endif + x = ggml_reshape_3d(ctx, x, d_inner, n_seq_tokens, n_seqs); +#else + // Alternatively, this does the same as the above, but faster + x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); +#endif // bias x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b);