metal : fix synchronization in new matrix multiplication kernel (#2686)

2024-12-24 18:34:36 +00:00 · 2023-08-21 06:59:29 -04:00 · 2023-08-21 06:59:29 -04:00 · dadbed99e6
commit dadbed99e6
parent cb1c0727bd
1 changed file with 2 additions and 1 deletion
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const  uchar * src0,
        threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
        for (int i = 0; i < 8; i++) {
+            threadgroup_barrier(mem_flags::mem_device);
            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
        }

-        threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup_barrier(mem_flags::mem_device);
        device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
        if (sgitg==0) {
            for (int i = 0; i < n_rows; i++) {