mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-02 14:54:35 +00:00)
perf: use bigger threadgroups in mm
parent 46385ee0d5
commit 3327d84a7f
@@ -1148,7 +1148,7 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector<uint32_t>& spirv,
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned(ne01),
+        s_algo->setWorkgroup({unsigned(ne01)/32,
                               unsigned(ne11),
                               unsigned(std::max(ne12, ne02)),
                              });
@@ -14,7 +14,7 @@
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 
-// layout(local_size_x = 8) in;
+layout(local_size_x = 32) in;
 
 layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout(binding = 1) readonly buffer tensorInB { float inB[]; };
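The two hunks are paired: with layout(local_size_x = 32), the shader now runs 32 invocations per workgroup along x, so the host side divides the x dispatch count by 32 to keep the total number of x invocations at ne01. Below is a minimal sketch of that arithmetic; the helper name dispatch_x is illustrative only (not from the source), and it assumes ne01 is a multiple of 32, as the unguarded division in the diff implies.

    #include <cassert>
    #include <cstdint>

    // Illustrative sketch: relation between the x dispatch count passed to
    // setWorkgroup() and the shader's local_size_x after this commit.
    constexpr uint32_t kLocalSizeX = 32;  // matches layout(local_size_x = 32) in;

    static uint32_t dispatch_x(int64_t ne01) {
        // The diff divides without rounding up, so this path assumes
        // ne01 is a multiple of the workgroup size.
        assert(ne01 % kLocalSizeX == 0);
        // total x invocations = (ne01 / 32) workgroups * 32 threads = ne01
        return static_cast<uint32_t>(ne01 / kLocalSizeX);
    }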