diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 2c8af51a6..5c6159a83 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6232,7 +6232,7 @@ static __global__ void soft_max_f32(const float * x, const half * mask, const ha
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;
 
-        const float val = x[ix]*scale + (mask ? __half2float(mask[iy]) : 0.0f) + (pos ? __half2float(slope*pos[col]) : 0.0f);
+        const float val = x[ix]*scale + (mask ? __half2float(mask[iy]) : 0.0f) + (pos ? slope*__half2float(pos[col]) : 0.0f);
 
         vals[col] = val;
         max_val = max(max_val, val);
diff --git a/ggml.c b/ggml.c
index efc570db6..9a2ae6264 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5192,7 +5192,7 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(mask->type == GGML_TYPE_F16);
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(ggml_is_matrix(mask));
-        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+        GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
     if (pos) {
diff --git a/llama.cpp b/llama.cpp
index 2359ed10a..5aa3a508d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -102,7 +102,7 @@
 #define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_EXPERTS 8
 
-#define LLAMA_FLASH_ATTN
+//#define LLAMA_FLASH_ATTN
 
 //
 // logging
@@ -4831,6 +4831,11 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * cur;
 
 #if defined(LLAMA_FLASH_ATTN)
+    GGML_UNUSED(model);
+    GGML_UNUSED(n_ctx);
+
+    GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
+
     // split cached v into n_head heads (not transposed)
     struct ggml_tensor * v =
         ggml_view_3d(ctx, kv.v_l[il],
@@ -5260,7 +5265,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        struct ggml_tensor * KQ_pos = ggml_cast(ctx0, ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0), GGML_TYPE_F16);
         cb(KQ_pos, "KQ_pos", -1);
 
         // shift the entire K-cache if needed
@@ -5804,7 +5809,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        struct ggml_tensor * KQ_pos = ggml_cast(ctx0, ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0), GGML_TYPE_F16);
         cb(KQ_pos, "KQ_pos", -1);
 
         for (int il = 0; il < n_layer; ++il) {
@@ -6043,7 +6048,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        struct ggml_tensor * KQ_pos = ggml_cast(ctx0, ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0), GGML_TYPE_F16);
         cb(KQ_pos, "KQ_pos", -1);
 
         inpL = llm_build_norm(ctx0, inpL, hparams,
@@ -6140,7 +6145,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        struct ggml_tensor * KQ_pos = ggml_cast(ctx0, ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0), GGML_TYPE_F16);
         cb(KQ_pos, "KQ_pos", -1);
 
         for (int il = 0; il < n_layer; ++il) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 912223def..278c57299 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1505,7 +1505,7 @@ struct test_attn : public test_case {
         struct ggml_tensor * cur;
 
         cur = ggml_mul_mat    (ctx, k, q);
-        cur = ggml_soft_max_ext(ctx, cur, mask, 1.0f/sqrtf(hs));
+        cur = ggml_soft_max_ext(ctx, cur, mask, nullptr, 1.0f/sqrtf(hs), 0.0f);
         cur = ggml_mul_mat    (ctx, v, cur);
         cur = ggml_permute    (ctx, cur, 0, 2, 1, 3);
         cur = ggml_cont_2d    (ctx, cur, hs*nh, nb);
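
For context, a minimal sketch of the extended ggml_soft_max_ext call convention that this patch exercises. Only the six-argument signature (ctx, a, mask, pos, scale, max_bias) is taken from the call site in test-backend-ops.cpp above; the wrapper name, tensor shapes, and comments are illustrative assumptions, not part of the patch.

#include "ggml.h"
#include <cmath>

// soft_max_attn is a hypothetical helper, shown only to spell out the arguments:
// - mask: optional F16 matrix, broadcast over rows (the new assert in ggml.c
//         requires mask->ne[1] >= a->ne[1] instead of exact row repetition)
// - pos:  optional F16 token positions for ALiBi; nullptr disables ALiBi,
//         matching the test_attn call in the diff
// - max_bias: ALiBi maximum bias; 0.0f means no ALiBi contribution
static ggml_tensor * soft_max_attn(
        ggml_context * ctx,
        ggml_tensor  * kq,    // attention scores, e.g. [n_kv, n_tokens, n_head]
        ggml_tensor  * mask,  // F16 [n_kv, n_tokens], or nullptr
        int            head_size) {
    // scale by 1/sqrt(head_size), no ALiBi: pos = nullptr, max_bias = 0.0f
    return ggml_soft_max_ext(ctx, kq, mask, nullptr, 1.0f/sqrtf((float) head_size), 0.0f);
}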