llama.cpp (mirror of https://github.com/ggerganov/llama.cpp.git)
metal : specialize for head size
parent 52ae085750
commit b97325800a

ggml-metal.m  (23)
ggml-metal.m
@@ -147,7 +147,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
     GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
-    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
     GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
     GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
@@ -518,7 +520,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,     argsort_f32_i32_asc,     true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,    argsort_f32_i32_desc,    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,          leaky_relu_f32,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16,      flash_attn_ext_f16,      true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,  flash_attn_ext_f16_h64,  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,  flash_attn_ext_f16_h80,  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,             cpy_f32_f16,             true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,             cpy_f32_f32,             true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,            cpy_f32_q8_0,            true);
@@ -2172,6 +2176,7 @@ static bool ggml_metal_graph_compute(
             } break;
         case GGML_OP_FLASH_ATTN_EXT:
             {
+                GGML_ASSERT(ne00 % 4 == 0);
                 GGML_ASSERT(src0->type == GGML_TYPE_F16);

                 struct ggml_tensor * src2 = gf->nodes[i]->src[2];
@@ -2202,7 +2207,19 @@ static bool ggml_metal_graph_compute(
                 float scale;
                 memcpy(&scale, dst->op_params, sizeof(float));

-                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16].pipeline;
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (ne00) {
+                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
+                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
+                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
+                    default:
+                        {
+                            GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
+                            GGML_METAL_LOG_ERROR("add template specialization for this size\n");
+                            GGML_ASSERT(false && "add template specialization for this size");
+                        }
+                }

                 // TODO: extend if necessary
                 [encoder setComputePipelineState:pipeline];
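The host-side change above swaps a single pipeline lookup for a switch on the head size ne00, so each supported size uses its own precompiled pipeline. Below is a minimal, self-contained C++ sketch of that selection pattern, not the actual ggml-metal.m code: the pipeline struct, kernels table, and select_flash_attn_pipeline name are hypothetical, and only the 64/80/128 sizes and the fail-loudly default mirror the diff.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the per-kernel pipeline handles kept in ggml-metal.m.
enum kernel_type {
    KERNEL_FLASH_ATTN_EXT_F16_H64,
    KERNEL_FLASH_ATTN_EXT_F16_H80,
    KERNEL_FLASH_ATTN_EXT_F16_H128,
    KERNEL_COUNT,
};

struct pipeline { const char * name; };

// One pre-built pipeline per specialized head size, indexed by kernel type.
static pipeline kernels[KERNEL_COUNT] = {
    { "kernel_flash_attn_ext_f16_h64"  },
    { "kernel_flash_attn_ext_f16_h80"  },
    { "kernel_flash_attn_ext_f16_h128" },
};

// Pick the specialized pipeline for a given head size, failing loudly otherwise.
static pipeline * select_flash_attn_pipeline(int64_t ne00) {
    switch (ne00) {
        case  64: return &kernels[KERNEL_FLASH_ATTN_EXT_F16_H64 ];
        case  80: return &kernels[KERNEL_FLASH_ATTN_EXT_F16_H80 ];
        case 128: return &kernels[KERNEL_FLASH_ATTN_EXT_F16_H128];
        default:
            fprintf(stderr, "unsupported head size: %lld\n", (long long) ne00);
            assert(false && "add template specialization for this size");
            return nullptr;
    }
}

int main() {
    printf("%s\n", select_flash_attn_pipeline(128)->name); // prints kernel_flash_attn_ext_f16_h128
}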
ggml-metal.metal
@@ -1959,6 +1959,43 @@ kernel void kernel_leaky_relu_f32(
     dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
 }

+typedef void (flash_attn_ext_f16_t)(
+        device const  char * q,
+        device const  char * k,
+        device const  char * v,
+        device const  char * mask,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant   int64_t & ne13,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
+        constant   int64_t & ne31,
+        constant  uint64_t & nb31,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant     float & scale,
+        threadgroup   half * shared,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        uint3  tpitg[[thread_position_in_threadgroup]],
+        uint3    ntg[[threads_per_threadgroup]],
+        uint   tiisg[[thread_index_in_simdgroup]],
+        uint   sgitg[[simdgroup_index_in_threadgroup]]);
+
+template<int64_t D> // head size
 kernel void kernel_flash_attn_ext_f16(
         device const  char * q,
         device const  char * k,
@@ -2002,7 +2039,6 @@ kernel void kernel_flash_attn_ext_f16(
         return;
     }

-    const int64_t D = ne00;
     const int64_t D4 = D/4;

     // TODO: can we move this to the stack?
@@ -2097,6 +2133,10 @@ kernel void kernel_flash_attn_ext_f16(
     }
 }

+template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64>;
+template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80>;
+template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
+
 kernel void kernel_cpy_f16_f16(
         device const half * src0,
         device       half * dst,
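The Metal-side pattern used above (a kernel templated on the head size D, plus [[host_name(...)]] explicit instantiations that expose each size as a separately named pipeline function) is shown in isolation below. This is a hypothetical demo_head_size kernel with a placeholder body, not the flash-attention kernel; only the typedef/template/host_name structure mirrors the diff.

#include <metal_stdlib>
using namespace metal;

// Hypothetical typedef describing the kernel signature, mirroring how the
// diff declares flash_attn_ext_f16_t so the instantiations below can name it.
typedef void (demo_head_size_t)(
        device const float * src,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]);

// Kernel templated on the head size D: each instantiation is compiled as a
// separate function, so D is a compile-time constant inside the body.
template<int64_t D>
kernel void demo_head_size(
        device const float * src,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    float sum = 0.0f;
    for (int64_t i = 0; i < D; ++i) { // loop bound known at compile time
        sum += src[tpig*D + i];
    }
    dst[tpig] = sum;
}

// Explicit instantiations; [[host_name]] is the function name the host looks
// up (e.g. via newFunctionWithName:) when building the per-size pipeline.
template [[host_name("demo_head_size_h64" )]] kernel demo_head_size_t demo_head_size<64>;
template [[host_name("demo_head_size_h80" )]] kernel demo_head_size_t demo_head_size<80>;
template [[host_name("demo_head_size_h128")]] kernel demo_head_size_t demo_head_size<128>;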