diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 2c8af51a6..5c6159a83 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6232,7 +6232,7 @@ static __global__ void soft_max_f32(const float * x, const half * mask, const ha
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;
 
-        const float val = x[ix]*scale + (mask ? __half2float(mask[iy]) : 0.0f) + (pos ? __half2float(slope*pos[col]) : 0.0f);
+        const float val = x[ix]*scale + (mask ? __half2float(mask[iy]) : 0.0f) + (pos ? slope*__half2float(pos[col]) : 0.0f);
 
         vals[col] = val;
         max_val = max(max_val, val);
diff --git a/ggml.c b/ggml.c
index efc570db6..9a2ae6264 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5192,7 +5192,7 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(mask->type == GGML_TYPE_F16);
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(ggml_is_matrix(mask));
-        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+        GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
     if (pos) {
diff --git a/llama.cpp b/llama.cpp
index 2359ed10a..5aa3a508d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -102,7 +102,7 @@
 #define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_EXPERTS 8
 
-#define LLAMA_FLASH_ATTN
+//#define LLAMA_FLASH_ATTN
 
 //
 // logging
@@ -4831,6 +4831,11 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * cur;
 
 #if defined(LLAMA_FLASH_ATTN)
+    GGML_UNUSED(model);
+    GGML_UNUSED(n_ctx);
+
+    GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
+
     // split cached v into n_head heads (not transposed)
     struct ggml_tensor * v =
         ggml_view_3d(ctx, kv.v_l[il],
@@ -5260,7 +5265,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        struct ggml_tensor * KQ_pos = ggml_cast(ctx0, ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0), GGML_TYPE_F16);
         cb(KQ_pos, "KQ_pos", -1);
 
         // shift the entire K-cache if needed
@@ -5804,7 +5809,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        struct ggml_tensor * KQ_pos = ggml_cast(ctx0, ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0), GGML_TYPE_F16);
         cb(KQ_pos, "KQ_pos", -1);
 
         for (int il = 0; il < n_layer; ++il) {
@@ -6043,7 +6048,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        struct ggml_tensor * KQ_pos = ggml_cast(ctx0, ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0), GGML_TYPE_F16);
         cb(KQ_pos, "KQ_pos", -1);
 
         inpL = llm_build_norm(ctx0, inpL, hparams,
@@ -6140,7 +6145,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        struct ggml_tensor * KQ_pos = ggml_cast(ctx0, ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0), GGML_TYPE_F16);
         cb(KQ_pos, "KQ_pos", -1);
 
         for (int il = 0; il < n_layer; ++il) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 912223def..278c57299 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1505,7 +1505,7 @@ struct test_attn : public test_case {
         struct ggml_tensor * cur;
 
         cur = ggml_mul_mat    (ctx, k, q);
-        cur = ggml_soft_max_ext(ctx, cur, mask, 1.0f/sqrtf(hs));
+        cur = ggml_soft_max_ext(ctx, cur, mask, nullptr, 1.0f/sqrtf(hs), 0.0f);
         cur = ggml_mul_mat    (ctx, v, cur);
         cur = ggml_permute    (ctx, cur, 0, 2, 1, 3);
         cur = ggml_cont_2d    (ctx, cur, hs*nh, nb);
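
For context, a minimal sketch of the extended ggml_soft_max_ext call convention that this patch exercises. Only the six-argument signature (ctx, a, mask, pos, scale, max_bias) is taken from the call site in test-backend-ops.cpp above; the wrapper name, tensor shapes, and comments are illustrative assumptions, not part of the patch.

#include "ggml.h"
#include <cmath>

// soft_max_attn is a hypothetical helper, shown only to spell out the arguments:
// - mask: optional F16 matrix, broadcast over rows (the new assert in ggml.c
//         requires mask->ne[1] >= a->ne[1] instead of exact row repetition)
// - pos:  optional F16 token positions for ALiBi; nullptr disables ALiBi,
//         matching the test_attn call in the diff
// - max_bias: ALiBi maximum bias; 0.0f means no ALiBi contribution
static ggml_tensor * soft_max_attn(
        ggml_context * ctx,
        ggml_tensor  * kq,    // attention scores, e.g. [n_kv, n_tokens, n_head]
        ggml_tensor  * mask,  // F16 [n_kv, n_tokens], or nullptr
        int            head_size) {
    // scale by 1/sqrt(head_size), no ALiBi: pos = nullptr, max_bias = 0.0f
    return ggml_soft_max_ext(ctx, kq, mask, nullptr, 1.0f/sqrtf((float) head_size), 0.0f);
}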