Nomic Vulkan backend (#4456)

Signed-off-by: Jared Van Bortel <jared@nomic.ai> Co-authored-by: niansa <anton-sa@web.de> Co-authored-by: Adam Treat <treat.adam@gmail.com> Co-authored-by: Aaron Miller <apage43@ninjawhale.com> Co-authored-by: ToKiNoBug <tokinobug@163.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: slaren <slarengh@gmail.com>
2025-01-12 03:31:46 +00:00 · 2024-01-29 15:50:50 -05:00 · 2024-01-29 15:50:50 -05:00 · fbf1ddec69
commit fbf1ddec69
parent 2aed77eb06
45 changed files with 4271 additions and 19 deletions
--- a/.ecrc
+++ b/.ecrc
@ -1,4 +1,5 @@
 {
  "Exclude": ["^\\.gitmodules$"],
  "Disable": {
    "IndentSize": true
  }
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -337,6 +337,7 @@ jobs:
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1
    strategy:
      matrix:
@ -353,6 +354,8 @@ jobs:
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
    steps:
      - name: Clone
@ -361,6 +364,12 @@ jobs:
        with:
          fetch-depth: 0
      - name: Clone Kompute submodule
        id: clone_kompute
        if: ${{ matrix.build == 'kompute' }}
        run: |
          git submodule update --init kompute
      - name: Download OpenCL SDK
        id: get_opencl
        if: ${{ matrix.build == 'clblast' }}
@ -395,6 +404,15 @@ jobs:
          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
      - name: Install Vulkan SDK
        id: get_vulkan
        if: ${{ matrix.build == 'kompute' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
      - name: Build
        id: cmake_build
        run: |
@ -432,7 +450,8 @@ jobs:
      - name: Test
        id: cmake_test
-        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
+        # not all machines have native AVX-512
        if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
 [submodule "kompute"]
 	path = kompute
 	url = https://github.com/nomic-ai/kompute.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -103,6 +103,7 @@ option(LLAMA_VULKAN                          "llama: use Vulkan"
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
@ -484,7 +485,6 @@ if (LLAMA_HIPBLAS)
    endif()
 endif()
 if (LLAMA_SYCL)
    if ( NOT DEFINED ENV{ONEAPI_ROOT})
        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
@ -510,6 +510,160 @@ if (LLAMA_SYCL)
    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
 endif()
 if (LLAMA_KOMPUTE)
    # Make Vulkan-Hpp use its dynamic dispatch loader by default.
    add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
    find_package(Vulkan COMPONENTS glslc REQUIRED)
    # HINTS expects directories, so the imported target name `Vulkan::glslc` never
    # contributed to the search. Hint with the directory of the glslc that
    # FindVulkan located instead; PATH is still searched as a fallback.
    set(glslc_hint_dirs "")
    if (Vulkan_GLSLC_EXECUTABLE)
        get_filename_component(glslc_hint_dir "${Vulkan_GLSLC_EXECUTABLE}" DIRECTORY)
        list(APPEND glslc_hint_dirs "${glslc_hint_dir}")
    endif()
    find_program(glslc_executable NAMES glslc HINTS ${glslc_hint_dirs})
    if (NOT glslc_executable)
        message(FATAL_ERROR "glslc not found")
    endif()
    # compile_shader(SOURCES <shader.comp>...)
    #
    # For each listed compute shader this registers two build rules:
    #   1. glslc compiles <shader> to <file>.spv (Vulkan 1.2 target environment);
    #   2. xxd -i embeds that .spv in a header named shader<file minus ".comp.spv">.h,
    #      wrapped in an include guard and the kp::shader_data namespace.
    function(compile_shader)
      cmake_parse_arguments(compile_shader "" "" "SOURCES" ${ARGN})

      # Shared shader fragments every compile rule depends on, so that editing one
      # of them re-triggers compilation of all shaders.
      set(shader_common_deps
          ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
          ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
          ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
          ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp)

      # Visual Studio generators place binaries in a per-configuration
      # subdirectory; this is the only difference between the generators.
      if(CMAKE_GENERATOR MATCHES "Visual Studio")
        set(xxd_binary "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd")
      else()
        set(xxd_binary "${CMAKE_BINARY_DIR}/bin/xxd")
      endif()

      foreach(shader_src ${compile_shader_SOURCES})
        # Step 1: GLSL -> SPIR-V
        get_filename_component(shader_name ${shader_src} NAME)
        set(spirv_name ${shader_name}.spv)
        add_custom_command(
            OUTPUT ${spirv_name}
            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${shader_src} ${shader_common_deps}
            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spirv_name} ${CMAKE_CURRENT_SOURCE_DIR}/${shader_src}
            COMMENT "Compiling ${shader_src} to ${spirv_name}"
        )

        # Step 2: SPIR-V -> C header. Derive the header name and its guard macro,
        # e.g. op_add.comp.spv -> shaderop_add.h / SHADEROP_ADD_H.
        set(prefixed_name "shader${spirv_name}")
        string(REPLACE ".comp.spv" ".h" header_name ${prefixed_name})
        string(TOUPPER ${header_name} guard_macro)
        string(REPLACE "." "_" guard_macro "${guard_macro}")
        message(STATUS "${header_name} generating ${guard_macro}")
        add_custom_command(
          OUTPUT ${header_name}
          COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${header_name}
          COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${guard_macro}\" >> ${header_name}
          COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${guard_macro}\" >> ${header_name}
          COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${header_name}
          COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${header_name}
          COMMAND ${xxd_binary} -i ${spirv_name} >> ${header_name}
          COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${header_name}
          COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${guard_macro}\" >> ${header_name}
          DEPENDS ${spirv_name} xxd
          COMMENT "Converting to hpp: ${prefixed_name} ${xxd_binary}"
        )
      endforeach()
    endfunction()
    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
        message(STATUS "Kompute found")
        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
        add_subdirectory(kompute)
        # Compile our shaders
        compile_shader(SOURCES
          kompute-shaders/op_scale.comp
          kompute-shaders/op_scale_8.comp
          kompute-shaders/op_add.comp
          kompute-shaders/op_addrow.comp
          kompute-shaders/op_mul.comp
          kompute-shaders/op_silu.comp
          kompute-shaders/op_relu.comp
          kompute-shaders/op_gelu.comp
          kompute-shaders/op_softmax.comp
          kompute-shaders/op_norm.comp
          kompute-shaders/op_rmsnorm.comp
          kompute-shaders/op_diagmask.comp
          kompute-shaders/op_mul_mat_mat_f32.comp
          kompute-shaders/op_mul_mat_f16.comp
          kompute-shaders/op_mul_mat_q8_0.comp
          kompute-shaders/op_mul_mat_q4_0.comp
          kompute-shaders/op_mul_mat_q4_1.comp
          kompute-shaders/op_mul_mat_q6_k.comp
          kompute-shaders/op_getrows_f16.comp
          kompute-shaders/op_getrows_q4_0.comp
          kompute-shaders/op_getrows_q4_1.comp
          kompute-shaders/op_getrows_q6_k.comp
          kompute-shaders/op_rope_f16.comp
          kompute-shaders/op_rope_f32.comp
          kompute-shaders/op_cpy_f16_f16.comp
          kompute-shaders/op_cpy_f16_f32.comp
          kompute-shaders/op_cpy_f32_f16.comp
          kompute-shaders/op_cpy_f32_f32.comp
        )
        # Create a custom target for our generated shaders
        add_custom_target(generated_shaders DEPENDS
          shaderop_scale.h
          shaderop_scale_8.h
          shaderop_add.h
          shaderop_addrow.h
          shaderop_mul.h
          shaderop_silu.h
          shaderop_relu.h
          shaderop_gelu.h
          shaderop_softmax.h
          shaderop_norm.h
          shaderop_rmsnorm.h
          shaderop_diagmask.h
          shaderop_mul_mat_mat_f32.h
          shaderop_mul_mat_f16.h
          shaderop_mul_mat_q8_0.h
          shaderop_mul_mat_q4_0.h
          shaderop_mul_mat_q4_1.h
          shaderop_mul_mat_q6_k.h
          shaderop_getrows_f16.h
          shaderop_getrows_q4_0.h
          shaderop_getrows_q4_1.h
          shaderop_getrows_q6_k.h
          shaderop_rope_f16.h
          shaderop_rope_f32.h
          shaderop_cpy_f16_f16.h
          shaderop_cpy_f16_f32.h
          shaderop_cpy_f32_f16.h
          shaderop_cpy_f32_f32.h
        )
        # Create a custom command that depends on the generated_shaders target.
        # Its only action is to touch a stamp file in the current binary directory.
        add_custom_command(
            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
            DEPENDS generated_shaders
            COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
        )
        # Add the stamp to the main sources to ensure dependency tracking:
        # the stamp is listed next to both the source and the header, and these
        # two variables are later passed to add_library(ggml ...).
        set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
        set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
        # Enables the #ifdef GGML_USE_KOMPUTE code paths (e.g. backend registration in ggml-backend.c).
        add_compile_definitions(GGML_USE_KOMPUTE)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
        # NOTE(review): presumably lets ggml-kompute.cpp include the generated
        # shaderop_*.h headers from the build directory - confirm.
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
    else()
        message(WARNING "Kompute not found")
    endif()
 endif()
 function(get_flags CCID CCVER)
    set(C_FLAGS "")
    set(CXX_FLAGS "")
@ -859,6 +1013,7 @@ add_library(ggml OBJECT
            ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
            )
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -373,6 +373,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
    ggml_backend_vk_reg_devices();
 #endif
 #ifdef GGML_USE_KOMPUTE
    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
    ggml_backend_kompute_reg_devices();
 #endif
 }
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@ -0,0 +1,46 @@
 #pragma once
 #include "ggml.h"
 #include "ggml-backend.h"
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Description of a Vulkan device, as returned by ggml_vk_available_devices(),
 // ggml_vk_get_device() and ggml_vk_current_device().
 // NOTE(review): except for `type`, the field notes below are inferred from the
 // field names only - confirm against ggml-kompute.cpp.
 struct ggml_vk_device {
    int index;                // presumably the device's position in the enumeration - TODO confirm
    int type; // same as VkPhysicalDeviceType
    size_t heapSize;          // presumably the size of the device memory heap (bytes?) - TODO confirm
    const char * name;        // device name; ownership/lifetime of the string is not visible here
    const char * vendor;      // vendor name; ownership/lifetime of the string is not visible here
    int subgroupSize;         // presumably the Vulkan subgroup size of the device - TODO confirm
    uint64_t bufferAlignment; // presumably the required buffer offset alignment - TODO confirm
    uint64_t maxAlloc;        // presumably the largest single allocation the device supports - TODO confirm
 };
 struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
 bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
 bool ggml_vk_has_vulkan(void);
 bool ggml_vk_has_device(void);
 struct ggml_vk_device ggml_vk_current_device(void);
 //
 // backend API
 //
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;
 GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
 GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
 GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
 #ifdef __cplusplus
 }
 #endif
--- a/1
+++ b/1
@ -0,0 +1 @@
 Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306
--- a/kompute-shaders/common.comp
+++ b/kompute-shaders/common.comp
@ -0,0 +1,102 @@
 #extension GL_EXT_shader_16bit_storage: require
 #extension GL_EXT_shader_8bit_storage: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
 #extension GL_EXT_control_flow_attributes: enable
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 #define QK4_0 32
 #define QK4_1 32
 #define GELU_COEF_A 0.044715
 #define SQRT_2_OVER_PI 0.79788456080286535587989211986876
 #define TWOPI_F 6.283185307179586f
 #define QK_K 256
 #define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
 #define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
 #define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
 #define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
 #define sizeof_block_q4_0 0x12
 struct block_q4_0 {
    float16_t d;
    uint8_t qs[QK4_0 / 2];
 };
 // Expands 16 of the 32 weights of a q4_0 block into a 4x4 float matrix.
 // il == 0 picks the low nibble of every byte in xb.qs, any other value the high nibble.
 mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
    const bool useHighNibble = (il != 0);

    // Bytes are consumed as 16-bit pairs; loMask/hiMask select the nibble in
    // the first and second byte of each pair respectively.
    const uint16_t loMask = useHighNibble ? uint16_t(0x00F0) : uint16_t(0x000F);
    const uint16_t hiMask = loMask << 8;

    // The masked values are left un-shifted, so the scales are pre-divided to
    // cancel the nibble's bit position (x16 for a high nibble, x256 for the second byte).
    const float loScale = useHighNibble ? (xb.d / 16.f) : xb.d;
    const float hiScale = loScale / 256.f;
    const float bias    = -8.f * xb.d;

    mat4 values;
    for (int col = 0; col < 4; col++) {
        for (int slot = 0; slot < 2; slot++) {
            const int pairIdx = 2 * col + slot;
            const uint16_t bits = (uint16_t(xb.qs[2 * pairIdx + 1]) << 8) | uint16_t(xb.qs[2 * pairIdx]);
            values[col][2 * slot + 0] = loScale * (bits & loMask) + bias;
            values[col][2 * slot + 1] = hiScale * (bits & hiMask) + bias;
        }
    }
    return values;
 }
 #define sizeof_block_q4_1 0x14
 struct block_q4_1 {
    float16_t d;
    float16_t m;
    uint8_t qs[QK4_1 / 2];
 };
 // Expands 16 of the 32 weights of a q4_1 block into a 4x4 float matrix.
 // il == 0 picks the low nibble of every byte in xb.qs, any other value the high nibble.
 // Unlike q4_0, the additive term is the block's stored minimum xb.m.
 mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
    const bool useHighNibble = (il != 0);

    // Nibble selectors for the first and second byte of each 16-bit pair.
    const uint16_t loMask = useHighNibble ? uint16_t(0x00F0) : uint16_t(0x000F);
    const uint16_t hiMask = loMask << 8;

    // Scales pre-divided to cancel the bit position of the un-shifted masked value.
    const float loScale = useHighNibble ? (xb.d / 16.f) : xb.d;
    const float hiScale = loScale / 256.f;
    const float minimum = xb.m;

    mat4 values;
    for (int col = 0; col < 4; col++) {
        for (int slot = 0; slot < 2; slot++) {
            const int pairIdx = 2 * col + slot;
            const uint16_t bits = (uint16_t(xb.qs[2 * pairIdx + 1]) << 8) | uint16_t(xb.qs[2 * pairIdx]);
            values[col][2 * slot + 0] = ((bits & loMask) * loScale) + minimum;
            values[col][2 * slot + 1] = ((bits & hiMask) * hiScale) + minimum;
        }
    }
    return values;
 }
 #define sizeof_block_q6_k 210
 struct block_q6_k {
    uint8_t ql[QK_K/2];      // quants, lower 4 bits
    uint8_t qh[QK_K/4];      // quants, upper 2 bits
    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
    float16_t d;             // super-block scale
 };
 // Dequantizes 16 values of a q6_k super-block into a 4x4 float matrix.
 // il selects which group of 16 values is produced; the byte offsets into
 // xb.ql / xb.qh, the scale entry and the bit masks are all derived from it.
 mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
    const float16_t d_all = xb.d;
    // Start offsets of this group's 16 bytes inside ql (4-bit parts) and qh (2-bit parts).
    const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
    const uint qhIndex = 32*(il/8) + 16*(il&1);
    // The index expression evaluates to il itself: one scale per group of 16 values.
    float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
    // From here on il only identifies which of the four 2-bit fields of a qh byte is used.
    il = (il/2) & 3;
    // kmask1 isolates 2-bit field number il of a qh byte (3, 12, 48 or 192).
    const uint16_t  kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
    // kmask2 isolates the nibble of a ql byte: low nibble for il 0/1, high nibble for il 2/3.
    const uint16_t  kmask2 = il>1 ? uint8_t(0xF0)             : uint8_t(0x0F);
    // When the high nibble is used the assembled value is 16x too large; coef cancels that.
    const float16_t coef   = il>1 ? float16_t(1.f/16.f)       : float16_t(1.f);
    // Result is dl * q - ml, i.e. d_all * sc * (q * coef - 32).
    const float16_t ml = float16_t(d_all * sc * 32.f);
    const float16_t dl = float16_t(d_all * sc * coef);
    mat4 reg;
    for (int i = 0; i < 16; ++i) {
        // The shift (2 for odd il, 4 for even il) moves the selected qh field
        // directly above the selected ql nibble, forming the 6-bit quant.
        const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
                                        : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
        reg[i/4][i%4] = dl * q - ml;
    }
    return reg;
 }
 #define QK8_0 32
 // struct block_q8_0 {
 //     float16_t d;         // delta
 //     int8_t    qs[QK8_0]; // quants
 // };
 #define sizeof_block_q8_0 34
--- a/kompute-shaders/op_add.comp
+++ b/kompute-shaders/op_add.comp
@ -0,0 +1,58 @@
 #version 450
 #include "common.comp"
 layout(local_size_x = 1024) in;
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
 layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
 layout(push_constant) uniform PushConstants {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int nb00;
    int nb01;
    int nb02;
    int nb03;
    int ne10;
    int ne11;
    int ne12;
    int ne13;
    int nb10;
    int nb11;
    int nb12;
    int nb13;
    int ne0;
    int nb0;
    int nb1;
    int nb2;
    int nb3;
  //int offs; // TODO: needed for GGML_OP_ACC, see metal code
 } pcs;
 // General-purpose element-wise addition of two float tensors.
 // Each workgroup handles one row; its invocations stride across the row.
 // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
 //       (inB indices wrap modulo ne11/ne12/ne13; the column index wraps modulo ne10)
 // cons: not very efficient
 void main() {
    const uvec3 wg = gl_WorkGroupID; // x -> dim 1, y -> dim 2, z -> dim 3

    // Placeholder for the GGML_OP_ACC byte offset (see the TODO in PushConstants).
    const int offs = 0;

    // Row start of each tensor, converted from bytes to float elements.
    const uint rowA   = uint((wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01 + offs) / 4);
    const uint rowB   = uint(((wg.z % pcs.ne13)*pcs.nb13 + (wg.y % pcs.ne12)*pcs.nb12 + (wg.x % pcs.ne11)*pcs.nb11) / 4);
    const uint rowOut = uint((wg.z*pcs.nb3 + wg.y*pcs.nb2 + wg.x*pcs.nb1 + offs) / 4);

    const uint baseA   = pcs.inAOff + rowA;
    const uint baseB   = pcs.inBOff + rowB;
    const uint baseOut = pcs.outOff + rowOut;

    for (uint col = gl_LocalInvocationID.x; col < pcs.ne0; col += gl_WorkGroupSize.x) {
        out_[baseOut + col] = inA[baseA + col] + inB[baseB + (col % pcs.ne10)];
    }
 }
--- a/kompute-shaders/op_addrow.comp
+++ b/kompute-shaders/op_addrow.comp
@ -0,0 +1,25 @@
 #version 450
 #include "common.comp"
 layout(local_size_x = 1) in;
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
 layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
 layout(push_constant) uniform PushConstants {
    uint inAOff;
    uint inBOff;
    uint outOff;
    uint row;
 } pcs;
 // Adds a row vector (inB, pcs.row elements long) to inA.
 // Each workgroup processes 4 consecutive elements.
 void main() {
    const uint firstElem = gl_WorkGroupID.x * 4;
    for (uint lane = 0; lane < 4; ++lane) {
        const uint elem = firstElem + lane;
        // inB is indexed modulo the row length, so it repeats for every row of inA.
        const float rowValue = inB[(elem % pcs.row) + pcs.inBOff];
        out_[elem + pcs.outOff] = inA[elem + pcs.inAOff] + rowValue;
    }
 }
--- a/kompute-shaders/op_cpy_f16_f16.comp
+++ b/kompute-shaders/op_cpy_f16_f16.comp
@ -0,0 +1,52 @@
 #version 450
 #include "common.comp"
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
 layout(local_size_x = 1024) in;
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
 layout (push_constant) uniform parameter {
    uint inOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
    int ne0;
    int ne1;
    int ne2;
    uint nb0;
    uint nb1;
    uint nb2;
    uint nb3;
 } pcs;
 // Copies one source row per workgroup, converting IN_TYPE to OUT_TYPE.
 // The source row is addressed through the nb0x byte strides; the destination
 // position is found by flattening the row start to a linear element index and
 // splitting that index again over the destination dimensions ne0..ne2.
 void main() {
    const uvec3 wg = gl_WorkGroupID; // x -> i01, y -> i02, z -> i03

    // Linear element index of the first element of this source row.
    const int flat = int(wg.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(wg.y)*pcs.ne01*pcs.ne00 + int(wg.x)*pcs.ne00;

    // Peel the destination coordinates off the linear index, outermost first.
    const int d3   = flat / (pcs.ne2*pcs.ne1*pcs.ne0);
    const int rem3 = flat - d3*pcs.ne2*pcs.ne1*pcs.ne0;
    const int d2   = rem3 / (pcs.ne1*pcs.ne0);
    const int rem2 = rem3 - d2*pcs.ne1*pcs.ne0;
    const int d1   = rem2 / pcs.ne0;
    const int d0   = rem2 - d1*pcs.ne0;

    // Destination start, in OUT_TYPE elements relative to out_.
    const uint dstBase = (d3*pcs.nb3 + d2*pcs.nb2 + d1*pcs.nb1 + d0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff;
    // Byte offset of the source row; only the column term varies inside the loop.
    const uint srcRowBytes = wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01;

    for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += gl_WorkGroupSize.x) {
        const uint srcIdx = uint((srcRowBytes + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // relative to in_
        out_[dstBase + col] = OUT_TYPE(in_[srcIdx]);
    }
 }
--- a/kompute-shaders/op_cpy_f16_f32.comp
+++ b/kompute-shaders/op_cpy_f16_f32.comp
@ -0,0 +1,52 @@
 #version 450
 #include "common.comp"
 #define IN_TYPE float16_t
 #define IN_TYPE_SIZE 2
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
 layout(local_size_x = 1024) in;
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
 layout (push_constant) uniform parameter {
    uint inOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
    int ne0;
    int ne1;
    int ne2;
    uint nb0;
    uint nb1;
    uint nb2;
    uint nb3;
 } pcs;
 // Copies one source row per workgroup, converting IN_TYPE to OUT_TYPE.
 // The source row is addressed through the nb0x byte strides; the destination
 // position is found by flattening the row start to a linear element index and
 // splitting that index again over the destination dimensions ne0..ne2.
 void main() {
    const uvec3 wg = gl_WorkGroupID; // x -> i01, y -> i02, z -> i03

    // Linear element index of the first element of this source row.
    const int flat = int(wg.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(wg.y)*pcs.ne01*pcs.ne00 + int(wg.x)*pcs.ne00;

    // Peel the destination coordinates off the linear index, outermost first.
    const int d3   = flat / (pcs.ne2*pcs.ne1*pcs.ne0);
    const int rem3 = flat - d3*pcs.ne2*pcs.ne1*pcs.ne0;
    const int d2   = rem3 / (pcs.ne1*pcs.ne0);
    const int rem2 = rem3 - d2*pcs.ne1*pcs.ne0;
    const int d1   = rem2 / pcs.ne0;
    const int d0   = rem2 - d1*pcs.ne0;

    // Destination start, in OUT_TYPE elements relative to out_.
    const uint dstBase = (d3*pcs.nb3 + d2*pcs.nb2 + d1*pcs.nb1 + d0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff;
    // Byte offset of the source row; only the column term varies inside the loop.
    const uint srcRowBytes = wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01;

    for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += gl_WorkGroupSize.x) {
        const uint srcIdx = uint((srcRowBytes + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // relative to in_
        out_[dstBase + col] = OUT_TYPE(in_[srcIdx]);
    }
 }
--- a/kompute-shaders/op_cpy_f32_f16.comp
+++ b/kompute-shaders/op_cpy_f32_f16.comp
@ -0,0 +1,52 @@
 #version 450
 #include "common.comp"
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float16_t
 #define OUT_TYPE_SIZE 2
 layout(local_size_x = 1024) in;
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
 layout (push_constant) uniform parameter {
    uint inOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
    int ne0;
    int ne1;
    int ne2;
    uint nb0;
    uint nb1;
    uint nb2;
    uint nb3;
 } pcs;
 // Copies one source row per workgroup, converting IN_TYPE to OUT_TYPE.
 // The source row is addressed through the nb0x byte strides; the destination
 // position is found by flattening the row start to a linear element index and
 // splitting that index again over the destination dimensions ne0..ne2.
 void main() {
    const uvec3 wg = gl_WorkGroupID; // x -> i01, y -> i02, z -> i03

    // Linear element index of the first element of this source row.
    const int flat = int(wg.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(wg.y)*pcs.ne01*pcs.ne00 + int(wg.x)*pcs.ne00;

    // Peel the destination coordinates off the linear index, outermost first.
    const int d3   = flat / (pcs.ne2*pcs.ne1*pcs.ne0);
    const int rem3 = flat - d3*pcs.ne2*pcs.ne1*pcs.ne0;
    const int d2   = rem3 / (pcs.ne1*pcs.ne0);
    const int rem2 = rem3 - d2*pcs.ne1*pcs.ne0;
    const int d1   = rem2 / pcs.ne0;
    const int d0   = rem2 - d1*pcs.ne0;

    // Destination start, in OUT_TYPE elements relative to out_.
    const uint dstBase = (d3*pcs.nb3 + d2*pcs.nb2 + d1*pcs.nb1 + d0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff;
    // Byte offset of the source row; only the column term varies inside the loop.
    const uint srcRowBytes = wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01;

    for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += gl_WorkGroupSize.x) {
        const uint srcIdx = uint((srcRowBytes + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // relative to in_
        out_[dstBase + col] = OUT_TYPE(in_[srcIdx]);
    }
 }
--- a/kompute-shaders/op_cpy_f32_f32.comp
+++ b/kompute-shaders/op_cpy_f32_f32.comp
@ -0,0 +1,52 @@
 #version 450
 #include "common.comp"
 #define IN_TYPE float
 #define IN_TYPE_SIZE 4
 #define OUT_TYPE float
 #define OUT_TYPE_SIZE 4
 layout(local_size_x = 1024) in;
 layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
 layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
 layout (push_constant) uniform parameter {
    uint inOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
    int ne0;
    int ne1;
    int ne2;
    uint nb0;
    uint nb1;
    uint nb2;
    uint nb3;
 } pcs;
 // Copies one source row per workgroup, converting IN_TYPE to OUT_TYPE.
 // The source row is addressed through the nb0x byte strides; the destination
 // position is found by flattening the row start to a linear element index and
 // splitting that index again over the destination dimensions ne0..ne2.
 void main() {
    const uvec3 wg = gl_WorkGroupID; // x -> i01, y -> i02, z -> i03

    // Linear element index of the first element of this source row.
    const int flat = int(wg.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(wg.y)*pcs.ne01*pcs.ne00 + int(wg.x)*pcs.ne00;

    // Peel the destination coordinates off the linear index, outermost first.
    const int d3   = flat / (pcs.ne2*pcs.ne1*pcs.ne0);
    const int rem3 = flat - d3*pcs.ne2*pcs.ne1*pcs.ne0;
    const int d2   = rem3 / (pcs.ne1*pcs.ne0);
    const int rem2 = rem3 - d2*pcs.ne1*pcs.ne0;
    const int d1   = rem2 / pcs.ne0;
    const int d0   = rem2 - d1*pcs.ne0;

    // Destination start, in OUT_TYPE elements relative to out_.
    const uint dstBase = (d3*pcs.nb3 + d2*pcs.nb2 + d1*pcs.nb1 + d0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff;
    // Byte offset of the source row; only the column term varies inside the loop.
    const uint srcRowBytes = wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01;

    for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += gl_WorkGroupSize.x) {
        const uint srcIdx = uint((srcRowBytes + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // relative to in_
        out_[dstBase + col] = OUT_TYPE(in_[srcIdx]);
    }
 }
--- a/kompute-shaders/op_diagmask.comp
+++ b/kompute-shaders/op_diagmask.comp
@ -0,0 +1,30 @@
 #version 450
 #include "common.comp"
 layout(local_size_x = 1) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
 layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    uint n_past;
    int ne00;
    int ne01;
 } pcs;
 // Diagonal mask: one workgroup per element. Elements whose column index is
 // greater than n_past + row index are replaced by -infinity, all others are copied.
 void main() {
    const uint plane = gl_WorkGroupID.z;
    const uint row   = gl_WorkGroupID.y;
    const uint col   = gl_WorkGroupID.x;

    const uint elem = plane*pcs.ne01*pcs.ne00 + row*pcs.ne00 + col;

    if (col <= pcs.n_past + row) {
        // Visible position: pass the input through unchanged.
        out_[elem + pcs.outOff] = in_[elem + pcs.inOff];
    } else {
        // Masked position: 0xFF800000 is the bit pattern of -infinity.
        out_[elem + pcs.outOff] = uintBitsToFloat(0xFF800000);
    }
 }
--- a/kompute-shaders/op_gelu.comp
+++ b/kompute-shaders/op_gelu.comp
@ -0,0 +1,22 @@
 #version 450
 #include "common.comp"
 layout(local_size_x = 1) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
 layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
 } pcs;
 // GELU (tanh approximation). Each workgroup processes 8 consecutive elements.
 void main() {
    const uint firstElem = gl_WorkGroupID.x * 8;
    for (uint lane = 0; lane < 8; ++lane) {
        const uint elem = firstElem + lane;
        const float v = in_[elem + pcs.inOff];
        // The tanh argument is clamped to [-15, 15] before evaluation.
        const float tanhArg = clamp(SQRT_2_OVER_PI*v*(1.0 + GELU_COEF_A*v*v), -15.0, 15.0);
        out_[elem + pcs.outOff] = 0.5*v*(1.0 + tanh(tanhArg));
    }
 }
--- a/kompute-shaders/op_getrows.comp
+++ b/kompute-shaders/op_getrows.comp
@ -0,0 +1,17 @@
 // Shared entry point of the quantized get-rows shaders. The including shader
 // must provide NL, SIZE_OF_BLOCK, BYTES_FOR_TYPE, dequantize_block(), the
 // buffers inB / out_ and the push constants pcs.
 //
 // Workgroup i reads row index r = inB[i] and writes the dequantized row r of
 // the source tensor to row i of out_, 16 values (one mat4) per loop step.
 void main() {
    const uint i = gl_WorkGroupID.x;
    const int r = inB[i + pcs.inBOff];

    // First element of the destination row, relative to out_.
    const uint outRow = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff;

    for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
        // Byte offset of the block holding chunk `ind`; ind%NL selects the chunk inside it.
        const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
        const mat4 result = dequantize_block(inIndex, ind%NL);

        // Derive the output position from ind rather than from a running
        // counter: a counter is only correct when a single invocation visits
        // every chunk in order, i.e. when local_size_x == 1.
        const uint outBase = outRow + ind * 16;
        for (uint j = 0; j < 4; ++j) {
            for (uint k = 0; k < 4; ++k) {
                out_[outBase + j * 4 + k] = result[j][k];
            }
        }
    }
 }
--- a/kompute-shaders/op_getrows_f16.comp
+++ b/kompute-shaders/op_getrows_f16.comp
@ -0,0 +1,31 @@
 #version 450
 #include "common.comp"
 layout(local_size_x = 1) in;
 layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { int inB[]; };
 layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
 layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int nb01;
    int nb1;
 } pcs;
 // Copies k half-precision values from inA (starting at element x) to the float
 // buffer out_ (starting at element y), widening each value on assignment.
 void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
    int elem = 0;
    while (elem < k) {
        out_[y + elem] = inA[x + elem];
        elem++;
    }
 }
 // One workgroup per output row: looks up the source row index in inB and
 // copies that f16 row of inA into row i of out_ as floats.
 void main() {
    const uint i = gl_WorkGroupID.x;
    const int r = inB[i + pcs.inBOff];

    // Byte strides converted to element offsets: 2 bytes per float16 source
    // element, 4 bytes per float destination element.
    const uint srcStart = r*pcs.nb01/2 + pcs.inAOff;
    const uint dstStart = i*pcs.nb1/4 + pcs.outOff;

    dequantize_row_f16(srcStart, dstStart, pcs.ne00);
 }
--- a/kompute-shaders/op_getrows_q4_0.comp
+++ b/kompute-shaders/op_getrows_q4_0.comp
@ -0,0 +1,38 @@
 #version 450
 #include "common.comp"
 #define NL 2
 #define BYTES_FOR_TYPE 4 /*bytes for float*/
 #define SIZE_OF_BLOCK sizeof_block_q4_0
 layout(local_size_x = 1) in;
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { int inB[]; };
 layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
 layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int nb01;
    int nb1;
 } pcs;
 // Assembles a block_q4_0 from the byte buffer inA, starting at byte `index`:
 // 2 bytes holding the half-precision scale, followed by QK4_0/2 quant bytes.
 block_q4_0 get_unaligned_block_q4_0(uint index) {
    block_q4_0 blk;
    blk.d = u8BufToFloat16(inA, index);

    const uint qsStart = index + 2; // quants follow the 2-byte scale
    [[unroll]] for (uint b = 0; b < QK4_0 / 2; b++) {
        blk.qs[b] = inA[qsStart + b];
    }
    return blk;
 }
 // Hook used by op_getrows.comp: loads the q4_0 block at byte offset `index`
 // of inA and dequantizes the 16 values selected by il.
 mat4 dequantize_block(uint index, uint il) {
    return dequantize_q4_0(get_unaligned_block_q4_0(index), il);
 }
 #include "op_getrows.comp"
--- a/kompute-shaders/op_getrows_q4_1.comp
+++ b/kompute-shaders/op_getrows_q4_1.comp
@ -0,0 +1,39 @@
 #version 450
 #include "common.comp"
 #define NL 2
 #define BYTES_FOR_TYPE 4 /*bytes for float*/
 #define SIZE_OF_BLOCK sizeof_block_q4_1
 layout(local_size_x = 1) in;
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { int inB[]; };
 layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
 layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int nb01;
    int nb1;
 } pcs;
 // Assembles a block_q4_1 from the byte buffer inA, starting at byte `index`:
 // 2 bytes of scale d, 2 bytes of minimum m, then QK4_1/2 quant bytes.
 block_q4_1 get_unaligned_block_q4_1(uint index) {
    block_q4_1 blk;
    blk.d = u8BufToFloat16(inA, index);
    blk.m = u8BufToFloat16(inA, index+2);

    const uint qsStart = index + 4; // quants follow the two half-precision fields
    [[unroll]] for (uint b = 0; b < QK4_1 / 2; b++) {
        blk.qs[b] = inA[qsStart + b];
    }
    return blk;
 }
 // Hook used by op_getrows.comp: loads the q4_1 block at byte offset `index`
 // of inA and dequantizes the 16 values selected by il.
 mat4 dequantize_block(uint index, uint il) {
    return dequantize_q4_1(get_unaligned_block_q4_1(index), il);
 }
 #include "op_getrows.comp"
--- a/kompute-shaders/op_getrows_q6_k.comp
+++ b/kompute-shaders/op_getrows_q6_k.comp
@ -0,0 +1,44 @@
 #version 450
 #include "common.comp"
 #define NL 16
 #define BYTES_FOR_TYPE 4 /*bytes for float*/
 #define SIZE_OF_BLOCK sizeof_block_q6_k
 // Shader interface: local size of 1 along x, three storage buffers and a push-constant block.
 layout(local_size_x = 1) in;
 // inA is declared as raw bytes; get_unaligned_block_q6_k() below rebuilds blocks from it byte by byte.
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { int inB[]; };
 layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
 // NOTE(review): inB, out_ and these push constants are not referenced by the functions in this file;
 // presumably main() in the included op_getrows.comp consumes them -- confirm there.
 layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int nb01;
    int nb1;
 } pcs;
 // Builds a block_q6_k from the byte buffer inA starting at byte `index`. Fields are read in
 // this order: QK_K/2 bytes of `ql`, QK_K/4 bytes of `qh`, QK_K/16 bytes of `scales`
 // (reinterpreted as int8_t), then `d` decoded by u8BufToFloat16.
 block_q6_k get_unaligned_block_q6_k(uint index) {
    // byte positions of the fields that follow `ql`
    const uint qh_pos     = index + QK_K/2;
    const uint scales_pos = index + QK_K/2 + QK_K/4;
    const uint d_pos      = index + QK_K/2 + QK_K/4 + QK_K/16;
    block_q6_k blk;
    [[unroll]] for (uint k = 0; k < QK_K / 2; ++k) {
        blk.ql[k] = inA[index + k];
    }
    [[unroll]] for (uint k = 0; k < QK_K / 4; ++k) {
        blk.qh[k] = inA[qh_pos + k];
    }
    [[unroll]] for (uint k = 0; k < QK_K / 16; ++k) {
        blk.scales[k] = int8_t(inA[scales_pos + k]);
    }
    blk.d = u8BufToFloat16(inA, d_pos);
    return blk;
 }
 // Reads the q6_k block located at byte `index` of inA and returns the result of
 // dequantize_q6_k for it, forwarding `il` unchanged.
 mat4 dequantize_block(uint index, uint il) {
    return dequantize_q6_k(get_unaligned_block_q6_k(index), il);
 }
 #include "op_getrows.comp"
--- a/kompute-shaders/op_mul.comp
+++ b/kompute-shaders/op_mul.comp
@ -0,0 +1,52 @@
 #version 450
 #include "common.comp"
 // Shader interface for the element-wise multiply: 1024 invocations along x per workgroup,
 // two read-only float inputs, one write-only float output and a push-constant block.
 layout(local_size_x = 1024) in;
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
 layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
 // main() adds the *Off values to float indices, divides sums of nb* values by 4 before indexing
 // the float buffers, uses ne10..ne13 as moduli for inB indices and ne0 as the row length.
 // ne00, nb00, nb10 and nb0 are not referenced by main().
 layout(push_constant) uniform PushConstants {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int nb00;
    int nb01;
    int nb02;
    int nb03;
    int ne10;
    int ne11;
    int ne12;
    int ne13;
    int nb10;
    int nb11;
    int nb12;
    int nb13;
    int ne0;
    int nb0;
    int nb1;
    int nb2;
    int nb3;
 } pcs;
 // Element-wise product out_ = inA * inB. Indices into inB are wrapped modulo ne10..ne13, so a
 // smaller inB is repeated across inA. Each workgroup handles the row selected by
 // gl_WorkGroupID; its invocations stride over the row's ne0 elements.
 void main() {
    const uvec3 row = gl_WorkGroupID; // (i01, i02, i03)
    // wrap the row coordinates into inB's extents
    const uint b1 = row.x % pcs.ne11;
    const uint b2 = row.y % pcs.ne12;
    const uint b3 = row.z % pcs.ne13;
    // row starts as float indices: summed nb* products are divided by 4, then the base offset is added
    const uint a_row   = pcs.inAOff + uint((row.z*pcs.nb03 + row.y*pcs.nb02 + row.x*pcs.nb01) / 4);
    const uint b_row   = pcs.inBOff + uint((b3*pcs.nb13 + b2*pcs.nb12 + b1*pcs.nb11) / 4);
    const uint dst_row = pcs.outOff + uint((row.z*pcs.nb3  + row.y*pcs.nb2  + row.x*pcs.nb1)  / 4);
    const uint stride = gl_WorkGroupSize.x;
    for (uint col = gl_LocalInvocationID.x; col < pcs.ne0; col += stride) {
        const uint b_col = col % pcs.ne10;
        out_[dst_row + col] = inA[a_row + col] * inB[b_row + b_col];
    }
 }
--- a/kompute-shaders/op_mul_mat_f16.comp
+++ b/kompute-shaders/op_mul_mat_f16.comp
@ -0,0 +1,67 @@
 #version 450
 #include "common.comp"
 #extension GL_KHR_shader_subgroup_arithmetic : require
 // Shader interface: the local size along x comes from specialization constant 0; inA holds
 // float16 values, inB and out_ hold floats.
 layout(local_size_x_id = 0) in;
 layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
 layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
 // main() divides nb01/nb02-based offsets by 2 to index inA and nb11/nb12-based offsets by 4 to
 // index inB; r2 and r3 divide the batch indices used for inA.
 // ne01, nb00, ne10 and nb10 are not referenced by main().
 layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    uint nb00;
    uint nb01;
    uint nb02;
    int ne10;
    int ne11;
    int ne12;
    uint nb10;
    uint nb11;
    uint nb12;
    int ne0;
    int ne1;
    uint r2;
    uint r3;
 } pcs;
 #define N_F16_F32 4
 // Dot products of one float16 row of inA against up to N_F16_F32 consecutive float rows of inB.
 // Each invocation accumulates a strided partial sum over ne00 elements, subgroupAdd combines
 // the partial sums, and the elected invocation stores the result.
 void main() {
    const uint a_row  = gl_WorkGroupID.x;
    const uint b_row0 = gl_WorkGroupID.y*N_F16_F32;
    const uint batch  = gl_WorkGroupID.z;
    const uint b12 = batch%pcs.ne12;
    const uint b13 = batch/pcs.ne12;
    const uint a_off = a_row*pcs.nb01 + (b12/pcs.r2)*pcs.nb02 + (b13/pcs.r3)*pcs.nb02*pcs.ne02;
    const uint a_base = a_off / 2 + pcs.inAOff; // Based from inA
    // stop after N_F16_F32 rows or at the last row of inB, whichever comes first
    for (uint k = 0; k < N_F16_F32 && b_row0 + k < pcs.ne11; ++k) {
        const uint b_row = b_row0 + k;
        const uint b_base = (b_row*pcs.nb11 + batch*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
        float partial = 0;
        for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
            partial += float(inA[a_base+i]) * float(inB[b_base+i]);
        }
        const float dot_sum = subgroupAdd(partial);
        if (subgroupElect()) {
            out_[batch*pcs.ne1*pcs.ne0 + b_row*pcs.ne0 + a_row + pcs.outOff] = dot_sum;
        }
    }
 }
--- a/kompute-shaders/op_mul_mat_mat_f32.comp
+++ b/kompute-shaders/op_mul_mat_mat_f32.comp
@ -0,0 +1,51 @@
 #version 450
 #include "common.comp"
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 // device subgroup size
 // (the local size along x is supplied through specialization constant 0)
 layout (local_size_x_id = 0) in;
 layout(binding = 0) readonly buffer tensorInA { float inA[]; };
 layout(binding = 1) readonly buffer tensorInB { float inB[]; };
 layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
 // main() divides nb*-based offsets by 4 before indexing the float buffers and compares
 // ne02 with ne12 to pick the batch index for each input.
 // ne01 and ne11 are not referenced by main().
 layout(push_constant) uniform parameter {
  uint inAOff;
  uint inBOff;
  uint outOff;
  int ne00;
  int ne01;
  int ne02;
  int ne11;
  int ne12;
  uint nb01;
  uint nb02;
  uint nb11;
  uint nb12;
  uint nb1;
  uint nb2;
 }
 pcs;
 // One dot product per workgroup: row gl_WorkGroupID.x of inA against row gl_WorkGroupID.y of
 // inB for batch gl_WorkGroupID.z. When one input has more batches than the other, the batch
 // index for the smaller one is divided by the ratio of the batch counts.
 void main() {
  const uvec3 wg = gl_WorkGroupID;
  uint batch_a = wg.z;
  if (pcs.ne12 > pcs.ne02) {
    batch_a = wg.z / (pcs.ne12 / pcs.ne02);
  }
  uint batch_b = wg.z;
  if (pcs.ne02 > pcs.ne12) {
    batch_b = wg.z / (pcs.ne02 / pcs.ne12);
  }
  const uint a_base = (wg.x*pcs.nb01 + batch_a*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
  const uint b_base = (wg.y*pcs.nb11 + batch_b*pcs.nb12) / 4 + pcs.inBOff; // based from inB
  float partial = 0.0f;
  for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
      partial += float(inA[a_base+i]) * float(inB[b_base+i]);
  }
  const float dot_sum = subgroupAdd(partial);
  if (subgroupElect()) {
    out_[wg.z*(pcs.nb2/4) + wg.y*(pcs.nb1/4) + wg.x + pcs.outOff] = dot_sum;
  }
 }
--- a/kompute-shaders/op_mul_mat_q4_0.comp
+++ b/kompute-shaders/op_mul_mat_q4_0.comp
@ -0,0 +1,33 @@
 #version 450
 #include "common.comp"
 #define BLOCKS_IN_QUANT QK4_0
 #define SIZE_OF_BLOCK sizeof_block_q4_0
 #define N_ROWS 4
 #include "op_mul_mv_q_n_pre.comp"
 // The q4_0 version of this function.
 // Partial dot product between one q4_0 block of inA (block number `block_index`) and floats of
 // inB starting at `yb`; `il` is the byte offset into the block's quants handled by the caller.
 // The nibbles are masked in place rather than shifted, so the matching y values are divided by
 // 16, 256 or 4096 to compensate. The final `sumy * -8` term is equivalent to subtracting 8 from
 // every nibble before multiplying.
 float block_q_n_dot_y(uint block_index, uint yb, uint il) {
    const uint blk = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
    const uint qs_pos = blk + 2 + il;            // quants follow the scale at the block start
    const uint yb_hi = yb + BLOCKS_IN_QUANT/2;   // y values paired with the high nibbles
    const float scale = float(u8BufToFloat16(inA, blk));
    float sumy = 0.0f;
    float acc_lo = 0.0f;
    float acc_hi = 0.0f;
    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
        const uint16_t b = u8BufToU16(inA, qs_pos + i);
        const float yl0 = inB[yb + i];
        const float yl1 = inB[yb + i + 1];
        const float yl8 = inB[yb_hi + i];
        const float yl9 = inB[yb_hi + i + 1];
        sumy += yl0 + yl1 + yl8 + yl9;
        acc_lo += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
        acc_hi += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
    }
    return scale * (sumy * -8.f + acc_lo + acc_hi);
 }
 #include "op_mul_mv_q_n.comp"
--- a/kompute-shaders/op_mul_mat_q4_1.comp
+++ b/kompute-shaders/op_mul_mat_q4_1.comp
@ -0,0 +1,35 @@
 #version 450
 #include "common.comp"
 #define BLOCKS_IN_QUANT QK4_1
 #define SIZE_OF_BLOCK sizeof_block_q4_1
 #define N_ROWS 4
 #include "op_mul_mv_q_n_pre.comp"
 // The q4_1 version of this function.
 // Partial dot product between one q4_1 block of inA (block number `block_index`) and floats of
 // inB starting at `yb`; `il` is the byte offset into the block's quants handled by the caller.
 // The nibbles are masked in place rather than shifted, so the matching y values are divided by
 // 16, 256 or 4096 to compensate. The result is d * sum(y*q) + m * sum(y).
 float block_q_n_dot_y(uint block_index, uint yb, uint il) {
    const uint blk = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
    const uint qs_pos = blk + 4 + il;            // quants follow d (at blk) and m (at blk+2)
    const uint yb_hi = yb + BLOCKS_IN_QUANT/2;   // y values paired with the high nibbles
    const float scale = float(u8BufToFloat16(inA, blk));
    const float minval = float(u8BufToFloat16(inA, blk+2));
    float sumy = 0.0f;
    float acc_lo = 0.0f;
    float acc_hi = 0.0f;
    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
        const uint16_t b = u8BufToU16(inA, qs_pos + i);
        const float yl0 = inB[yb + i];
        const float yl1 = inB[yb + i + 1];
        const float yl8 = inB[yb_hi + i];
        const float yl9 = inB[yb_hi + i + 1];
        sumy += yl0 + yl1 + yl8 + yl9;
        acc_lo += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
        acc_hi += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
    }
    return scale * (acc_lo + acc_hi) + sumy * minval;
 }
 #include "op_mul_mv_q_n.comp"
--- a/kompute-shaders/op_mul_mat_q6_k.comp
+++ b/kompute-shaders/op_mul_mat_q6_k.comp
@ -0,0 +1,94 @@
 #version 450
 #include "common.comp"
 #define SIZE_OF_BLOCK sizeof_block_q6_k
 // Shader interface: local size x and y come from specialization constants 0 and 1, z is fixed
 // at 1. inA is declared as raw bytes (q6_k blocks are decoded byte by byte in main()).
 layout(local_size_x_id = 0) in;
 layout(local_size_y_id = 1) in;
 layout(local_size_z = 1) in;
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
 layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
 // main() derives the number of blocks per row from ne00/QK_K and divides the batch index by
 // gqa when locating inA data. ne01 is not referenced by main().
 layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int ne10;
    int ne0;
    int ne1;
    int ne01;
    int gqa;
 } pcs;
 // Matrix-vector product for q6_k data: each subgroup computes one output element, the dot
 // product of row `row` of inA (nb blocks of SIZE_OF_BLOCK bytes) with floats from inB.
 // Invocations split the blocks and the positions inside a block among themselves, then the
 // partial sums are combined with subgroupAdd and stored by the elected invocation.
 void main() {
    // masks selecting the four 2-bit fields of a qh byte
    const uint8_t kmask1 = uint8_t(0x03);
    const uint8_t kmask2 = uint8_t(0x0C);
    const uint8_t kmask3 = uint8_t(0x30);
    const uint8_t kmask4 = uint8_t(0xC0);
    const uint nb = pcs.ne00/QK_K; // blocks per row
    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint r2 = gl_WorkGroupID.z;
    const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
    const uint x = row * nb + offset0; // Based from inA without base offset
    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
    float sumf = 0;
    // bits of invocation ID for gl_SubgroupSize=32:
    //  x   x   x   x   x
    //  4   3   2   1   0
    // (     tid     ) ix
    //  ip (   il    )
    const uint block_stride = gl_SubgroupSize / 16;         // number of blocks each subgroup processes
    const uint tid  = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
    const uint ix   = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
    const uint ip   = tid/8;        // first or second half of block (0 or 1)
    const uint il   = tid%8;        // each half has 8 parts, one per scale
    const uint n    = 4;            // 4 scales at a time (and 4 sums)
    const uint l0   = n*il;         // offset into half-block, 0..28
    const uint is   = 8*ip + l0/16; // 0, 1, 8, 9
    const uint y_offset = 128*ip + l0;
    const uint q_offset_l = 64*ip + l0;
    const uint q_offset_h = 32*ip + l0;
    for (uint i = ix; i < nb; i += block_stride) {
        // byte position of block i of this row inside inA
        const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
        const uint qlIndex = q_offset_l;
        const uint q2Index = qlIndex + QK_K/8;
        const uint qhIndex = q_offset_h;
        const uint y = yy + i * QK_K + y_offset;
        float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (uint l = 0; l < n; ++l) {
            const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
            const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
            // the high-bit bytes start QK_K/2 bytes into the block
            const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
            // each quant is a low nibble (or high nibble) combined with a 2-bit field of
            // currentQh moved to bits 4..5, then offset by -32
            sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
            sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
            sums[2] += inB[y+l+64] * (int8_t((currentQ1  >> 4) | ((currentQh & kmask3) << 0)) - 32);
            sums[3] += inB[y+l+96] * (int8_t((currentQ2  >> 4) | ((currentQh & kmask4) >> 2)) - 32);
        }
        // d is decoded at byte QK_K/2 + QK_K/4 + QK_K/16 of the block; the int8 scales used
        // below start at byte QK_K/2 + QK_K/4
        float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
        sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
    }
    const float tot = subgroupAdd(sumf);
    if (subgroupElect()) {
        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
    }
 }
--- a/kompute-shaders/op_mul_mat_q8_0.comp
+++ b/kompute-shaders/op_mul_mat_q8_0.comp
@ -0,0 +1,73 @@
 #version 450
 #include "common.comp"
 #include "op_mul_mv_q_n_pre.comp"
 #define SIZE_OF_D 2
 #define N_DST 4 // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 #define NB_Q8_0 8
 // Matrix-vector product for q8_0 data: each subgroup accumulates N_DST consecutive rows of
 // inA (raw q8_0 blocks) against floats of inB. Only the first 32 invocations of a subgroup
 // take part; four invocations share a block, each handling NB_Q8_0 of its quants. Partial
 // sums are combined with subgroupAdd and stored by the elected invocation.
 void main() {
    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
    if (gl_SubgroupInvocationID > 31)
        return;
    const int nr  = N_DST;
    const int nsg = N_SIMDGROUP;
    const int nw  = N_SIMDWIDTH;
    const int nb = pcs.ne00/QK8_0; // blocks per row
    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint im = gl_WorkGroupID.z;
    const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
    const uint i12 = im%pcs.ne12;
    const uint i13 = im/pcs.ne12;
    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
    const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
    const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
    float yl[NB_Q8_0];
    float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
    const uint ix = gl_SubgroupInvocationID.x/4; // block handled by this invocation
    const uint il = gl_SubgroupInvocationID.x%4; // which group of NB_Q8_0 quants inside the block
    uint yb = y + ix * QK8_0 + NB_Q8_0*il;
    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
    for (uint ib = ix; ib < nb; ib += nw/4) {
        for (int i = 0; i < NB_Q8_0; ++i) {
            yl[i] = inB[yb + i];
        }
        for (int row = 0; row < nr; row++) {
            const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
            float sumq = 0.f;
            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                // quants follow the SIZE_OF_D-byte scale at the start of the block
                const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
                sumq += qs_iq * yl[iq];
            }
            const float16_t d = u8BufToFloat16(inA, x + block_offset);
            sumf[row] += sumq*d;
        }
        yb += NB_Q8_0 * nw;
    }
    for (int row = 0; row < nr; ++row) {
        const float tot = subgroupAdd(sumf[row]);
        if (subgroupElect() && first_row + row < pcs.ne01) {
            // pcs.outOff must be part of the index, as it is for inAOff/inBOff above;
            // without it the result is stored at the wrong place whenever the offset is non-zero
            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
        }
    }
 }
--- a/kompute-shaders/op_mul_mv_q_n.comp
+++ b/kompute-shaders/op_mul_mv_q_n.comp
@ -0,0 +1,48 @@
 // Shared main() for matrix-vector shaders that define BLOCKS_IN_QUANT, N_ROWS and
 // block_q_n_dot_y() before including this file. Each subgroup accumulates N_ROWS consecutive
 // rows; only its first 32 invocations take part, two per block. Partial sums are combined
 // with subgroupAdd and stored by the elected invocation.
 void main() {
    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
    if (gl_SubgroupInvocationID > 31) {
        return;
    }
    const uint blocks_per_row = uint(pcs.ne00/BLOCKS_IN_QUANT);
    const uint wg_x  = gl_WorkGroupID.x;
    const uint b_row = gl_WorkGroupID.y;
    const uint batch = gl_WorkGroupID.z;
    const uint row0 = (wg_x * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
    const uint b12 = batch%pcs.ne12;
    const uint b13 = batch/pcs.ne12;
    // Based from inA without base offset
    const uint a_block0 = row0 * blocks_per_row + (b12/pcs.r2)*(blocks_per_row*pcs.ne01) + (b13/pcs.r3)*(blocks_per_row*pcs.ne01*pcs.ne02);
    // Based from inB
    const uint y_base = b_row*uint(pcs.ne10)+batch*pcs.ne00*pcs.ne1+pcs.inBOff;
    const uint first_block = gl_SubgroupInvocationID/2;
    const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
    uint yb = y_base + first_block * BLOCKS_IN_QUANT + il;
    float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (uint ib = first_block; ib < blocks_per_row; ib += 16) {
        for (int r = 0; r < N_ROWS; r++) {
            const uint block_index = a_block0 + ib + r * blocks_per_row;
            sumf[r] += block_q_n_dot_y(block_index, yb, il);
        }
        yb += BLOCKS_IN_QUANT * 16;
    }
    for (int r = 0; r < N_ROWS; ++r) {
        const float tot = subgroupAdd(sumf[r]);
        if (row0 + r < pcs.ne01 && subgroupElect()) {
            out_[b_row*pcs.ne0 + batch*pcs.ne0*pcs.ne1 + row0 + r + pcs.outOff] = tot;
        }
    }
 }
--- a/kompute-shaders/op_mul_mv_q_n_pre.comp
+++ b/kompute-shaders/op_mul_mv_q_n_pre.comp
@ -0,0 +1,22 @@
 // Common interface for shaders that include this file (op_mul_mat_q8_0.comp includes it
 // directly): local size x comes from specialization constant 0, y and z are fixed at 1.
 // inA is declared as raw bytes, inB and out_ as floats.
 layout(local_size_x_id = 0) in;
 layout(local_size_y = 1) in;
 layout(local_size_z = 1) in;
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 layout (binding = 1) readonly buffer tensorInB { float inB[]; };
 layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
 // The *Off fields are added to buffer indices by the including shaders; r2 and r3 divide the
 // batch indices used to locate inA data.
 layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int  ne00;
    int  ne01;
    int  ne02;
    int  ne10;
    int  ne12;
    int  ne0;
    int  ne1;
    uint r2;
    uint r3;
 } pcs;
--- a/kompute-shaders/op_norm.comp
+++ b/kompute-shaders/op_norm.comp
@ -0,0 +1,84 @@
 #version 450
 #include "common.comp"
 // Shader interface: 256 invocations along x per workgroup. out_ is not declared writeonly
 // because main() reads back the values it has written.
 layout(local_size_x = 256) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
 // ne00 is the number of elements per row; main() divides nb01 by 4 when locating the input
 // row and adds eps to the variance before the square root.
 layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    uint ne00;
    uint nb01;
    float eps;
 } pcs;
 // one partial sum per invocation, reduced in place by main()
 shared float sum[gl_WorkGroupSize.x];
 // Normalizes one row of ne00 floats per workgroup: out_ = (in_ - mean) / sqrt(variance + eps).
 // The mean and the variance are each computed with a per-invocation strided sum followed by
 // a tree reduction in the shared array `sum`.
 void main() {
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
    // MEAN
    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[x+i00];
    }
    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }
    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float mean = sum[0];
    // Every invocation must have read the mean from sum[0] before `sum` is reused for the
    // variance pass; otherwise invocation 0 could reset sum[0] while others still read it.
    barrier();
    memoryBarrierShared();
    // recenter
    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] = in_[x+i00] - mean;
    }
    // VARIANCE
    // parallel sum (each invocation only reads back the out_ elements it wrote itself)
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
    }
    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }
    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float variance = sum[0];
    const float scale = 1.0f/sqrt(variance + pcs.eps);
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] *= scale;
    }
 }
--- a/kompute-shaders/op_relu.comp
+++ b/kompute-shaders/op_relu.comp
@ -0,0 +1,21 @@
 #version 450
 #include "common.comp"
 // Shader interface: local size of 1 along x, one read-only float input and one write-only
 // float output.
 layout(local_size_x = 1) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
 // main() adds inOff / outOff to the element index used with in_ / out_.
 layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
 } pcs;
 // ReLU: each workgroup writes max(0, in_) for 4 consecutive elements starting at
 // gl_WorkGroupID.x * 4.
 void main() {
    const uint first = gl_WorkGroupID.x * 4;
    for (uint k = 0; k < 4; ++k) {
        const uint src = first + k + pcs.inOff;
        const uint dst = first + k + pcs.outOff;
        out_[dst] = max(0.0, in_[src]);
    }
 }
--- a/kompute-shaders/op_rmsnorm.comp
+++ b/kompute-shaders/op_rmsnorm.comp
@ -0,0 +1,53 @@
 #version 450
 #include "common.comp"
 // Shader interface: 512 invocations along x per workgroup, a read-only float input and a
 // float output.
 layout(local_size_x = 512) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict tensorOut { float out_[]; };
 // ne00 is the number of elements per row; main() divides nb01 by 4 when locating the input
 // row and adds eps to the mean of squares before the square root.
 layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    uint ne00;
    uint nb01;
    float eps;
 } pcs;
 // one partial sum per invocation, reduced in place by main()
 shared float sum[gl_WorkGroupSize.x];
 // RMS normalization of one row of ne00 floats per workgroup:
 // out_ = in_ / sqrt(mean(in_^2) + eps). The sum of squares is built from per-invocation
 // strided sums followed by a tree reduction in the shared array `sum`.
 void main() {
    const uint lid = gl_LocalInvocationID.x;
    const uint src_row = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
    // per-invocation sum of squares
    float partial = 0.0;
    for (uint col = lid; col < pcs.ne00; col += gl_WorkGroupSize.x) {
        partial += in_[src_row+col] * in_[src_row+col];
    }
    sum[lid] = partial;
    // tree reduction into sum[0]
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint half_n = gl_WorkGroupSize.x/2; half_n > 0; half_n /= 2) {
        if (lid < half_n) {
            sum[lid] += sum[lid + half_n];
        }
        barrier();
        memoryBarrierShared();
    }
    // invocation 0 turns the total into a mean, which everyone reads after the barrier
    if (lid == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
    const uint dst_row = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
    for (uint col = lid; col < pcs.ne00; col += gl_WorkGroupSize.x) {
        out_[dst_row+col] = in_[src_row+col] * scale;
    }
 }
--- a/kompute-shaders/op_rope_f16.comp
+++ b/kompute-shaders/op_rope_f16.comp
@ -0,0 +1,73 @@
 #version 450
 #include "rope_common.comp"
 layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
 layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
 layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
 // Rotary position embedding on float16 data. Each workgroup handles the row selected by
 // gl_WorkGroupID; the position is read from inB at inBOff + i2. Bit 1 of pcs.mode selects
 // the NeoX variant: without it, adjacent elements (src, src+1) are rotated as a pair over
 // the whole row; with it, pairs are (src, src + n_dims/2) and the elements from n_dims up to
 // ne0 are copied unchanged. Byte offsets built from nb* are divided by 2 to index the
 // float16 buffers.
 void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;
    const bool is_neox = (pcs.mode & 2) != 0;
    float corr_dims[2];
    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
    // theta is multiplied by this factor after every rotated pair
    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
    const int p = inB[pcs.inBOff + i2];
    float theta = float(p);
    if (!is_neox) {
        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
            float cos_theta, sin_theta;
            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
            theta *= theta_scale;
            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
            const float x0 = float(inA[src]);
            const float x1 = float(inA[src+1]);
            out_[dst_data]   = float16_t(x0*cos_theta - x1*sin_theta);
            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
        }
    } else {
        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
            const uint cur_rot = ic;
            float cos_theta, sin_theta;
            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
            theta *= theta_scale;
            const uint i0 = ic/2;
            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
            const float x0 = float(inA[src]);
            const float x1 = float(inA[src+pcs.n_dims/2]);
            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
        }
        // elements past n_dims are passed through untouched
        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
            const uint i0 = ic;
            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
            out_[dst_data + 0] = inA[src + 0];
            out_[dst_data + 1] = inA[src + 1];
        }
    }
 }
--- a/kompute-shaders/op_rope_f32.comp
+++ b/kompute-shaders/op_rope_f32.comp
@ -0,0 +1,73 @@
 #version 450
 #include "rope_common.comp"
 layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
 layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
 // Rotary position embedding on float data. Each workgroup handles the row selected by
 // gl_WorkGroupID; the position is read from inB at inBOff + i2. Bit 1 of pcs.mode selects
 // the NeoX variant: without it, adjacent elements (src, src+1) are rotated as a pair over
 // the whole row; with it, pairs are (src, src + n_dims/2) and the elements from n_dims up to
 // ne0 are copied unchanged. Byte offsets built from nb* are divided by 4 to index the float
 // buffers.
 void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;
    const bool is_neox = (pcs.mode & 2) != 0;
    float corr_dims[2];
    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
    // theta is multiplied by this factor after every rotated pair
    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
    const int p = inB[pcs.inBOff + i2];
    float theta = float(p);
    if (!is_neox) {
        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
            float cos_theta, sin_theta;
            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
            theta *= theta_scale;
            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
            const float x0 = inA[src];
            const float x1 = inA[src+1];
            out_[dst_data]   = x0*cos_theta - x1*sin_theta;
            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
        }
    } else {
        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
            const uint cur_rot = ic;
            float cos_theta, sin_theta;
            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
            theta *= theta_scale;
            const uint i0 = ic/2;
            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
            const float x0 = inA[src];
            const float x1 = inA[src+pcs.n_dims/2];
            out_[dst_data] = x0*cos_theta - x1*sin_theta;
            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
        }
        // elements past n_dims are passed through untouched
        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
            const uint i0 = ic;
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
            out_[dst_data + 0] = inA[src + 0];
            out_[dst_data + 1] = inA[src + 1];
        }
    }
 }
--- a/kompute-shaders/op_scale.comp
+++ b/kompute-shaders/op_scale.comp
@ -0,0 +1,19 @@
 #version 450
 #include "common.comp"
 // Shader interface: local size of 1 along x, one read-only float input and one write-only
 // float output.
 layout(local_size_x = 1) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
 // main() adds inOff / outOff to the element index and multiplies each input value by scale.
 layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    float scale;
 } pcs;
 // Scales a single element per workgroup: out_[i] = in_[i] * pcs.scale with i = gl_WorkGroupID.x.
 void main() {
    const uint src = gl_WorkGroupID.x + pcs.inOff;
    const uint dst = gl_WorkGroupID.x + pcs.outOff;
    out_[dst] = in_[src] * pcs.scale;
 }
--- a/kompute-shaders/op_scale_8.comp
+++ b/kompute-shaders/op_scale_8.comp
@ -0,0 +1,23 @@
 #version 450
 #include "common.comp"
 // Shader interface: local size of 1 along x, one read-only float input and one write-only
 // float output.
 layout(local_size_x = 1) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
 // main() adds inOff / outOff to the element index and multiplies each input value by scale.
 layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    float scale;
 } pcs;
 // Scales 8 consecutive elements per workgroup, starting at gl_WorkGroupID.x * 8:
 // out_[i] = in_[i] * pcs.scale.
 void main() {
    const uint first = gl_WorkGroupID.x * 8;
    for (uint k = 0; k < 8; ++k) {
        const uint src = first + k + pcs.inOff;
        const uint dst = first + k + pcs.outOff;
        out_[dst] = in_[src] * pcs.scale;
    }
 }
--- a/kompute-shaders/op_silu.comp
+++ b/kompute-shaders/op_silu.comp
@ -0,0 +1,22 @@
 #version 450
 #include "common.comp"
 // Shader interface: local size of 1 along x, one read-only float input and one write-only
 // float output.
 layout(local_size_x = 1) in;
 layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
 layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
 // main() adds inOff / outOff to the element index used with in_ / out_.
 layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
 } pcs;
 // SiLU: each workgroup writes y / (1 + exp(-y)) for 4 consecutive elements starting at
 // gl_WorkGroupID.x * 4.
 void main() {
    const uint first = gl_WorkGroupID.x * 4;
    for (uint k = 0; k < 4; ++k) {
        const uint src = first + k + pcs.inOff;
        const uint dst = first + k + pcs.outOff;
        const float y = in_[src];
        out_[dst] = y / (1.0 + exp(-y));
    }
 }
--- a/kompute-shaders/op_softmax.comp
+++ b/kompute-shaders/op_softmax.comp
@ -0,0 +1,56 @@
 // TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
 #version 450
 #include "common.comp"
 // Shader interface: local size x comes from specialization constant 0; inA holds the values,
 // inB an additive mask that main() applies only when pcs.mask is non-zero.
 layout(local_size_x_id = 0) in;
 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
 // out_ must not be writeonly: main() stores the exponentials in it and then reads them back
 // to divide by the row sum (`out_[...] /= sum`).
 layout(binding = 2) buffer restrict tensorOut { float out_[]; };
 // ne00 is the row length; ne01 and ne02 are used by main() to locate the row. Each input
 // value is multiplied by scale before the mask is added.
 layout(push_constant) uniform PushConstants {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    float scale;
    int mask;
 } pcs;
 // Softmax over one row of ne00 values per workgroup, using only the first 32 invocations of
 // the subgroup. Three strided passes: row maximum (subgroupMax), exponentials and their sum
 // (subgroupAdd), then division by the sum. Values are inA * scale, plus inB when mask != 0.
 void main() {
    if (gl_SubgroupInvocationID > 31) {
        return;
    }
    const uint lane = gl_SubgroupInvocationID.x;
    const uint r1 = gl_WorkGroupID.x;
    const uint r2 = gl_WorkGroupID.y;
    const uint r3 = gl_WorkGroupID.z;
    const uint row_off = r3*pcs.ne02*pcs.ne01*pcs.ne00 + r2*pcs.ne01*pcs.ne00 + r1*pcs.ne00;
    const uint src = row_off + pcs.inAOff; // Based from inA
    const uint msk = r1*pcs.ne00 + pcs.inBOff; // Based from inB
    const uint dst = row_off + pcs.outOff; // Based from out_
    // pass 1: row maximum, starting from -infinity (bit pattern 0xFF800000)
    float lane_max = uintBitsToFloat(0xFF800000);
    for (uint c = lane; c < pcs.ne00; c += 32) {
        lane_max = max(lane_max, inA[src + c]*pcs.scale + (pcs.mask!=0 ? inB[msk + c] : 0.0f));
    }
    const float row_max = subgroupMax(lane_max);
    // pass 2: exponentials relative to the maximum, and their sum
    float lane_sum = 0.0f;
    for (uint c = lane; c < pcs.ne00; c += 32) {
        const float e = exp(inA[src + c]*pcs.scale + (pcs.mask!=0 ? inB[msk + c] : 0.0f) - row_max);
        lane_sum += e;
        out_[dst + c] = e;
    }
    const float row_sum = subgroupAdd(lane_sum);
    // pass 3: normalize
    for (uint c = lane; c < pcs.ne00; c += 32) {
        out_[dst + c] /= row_sum;
    }
 }
--- a/kompute-shaders/rope_common.comp
+++ b/kompute-shaders/rope_common.comp
@ -0,0 +1,67 @@
 #include "common.comp"
 // TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;
 // Push constants shared by the rope shaders that include this file. In their main():
 // the *Off fields are added to buffer indices, bit 1 of mode selects the NeoX variant,
 // n_dims/n_orig_ctx/freq_base/beta_fast/beta_slow feed rope_yarn_corr_dims(), freq_scale/
 // ext_factor/attn_factor feed rope_yarn(), and the nb* fields build element offsets.
 layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int n_dims;
    int mode;
    int n_orig_ctx;
    float freq_base;
    float freq_scale;
    float ext_factor;
    float attn_factor;
    float beta_fast;
    float beta_slow;
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
    int ne0;
    uint nb0;
    uint nb1;
    uint nb2;
    uint nb3;
 } pcs;
 // Linear ramp over the dimension index i0/2: returns 1 at or below `low`, 0 at or above
 // `high`, and falls linearly in between. The span is floored at 0.001 to avoid dividing by zero.
 float rope_yarn_ramp(const float low, const float high, const float i0) {
    const float span = max(0.001f, high - low);
    const float pos = (i0 / 2 - low) / span;
    const float clamped = min(1.0f, max(0.0f, pos));
    return 1.0f - clamped;
 }
 // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
 // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
 // Writes cos/sin of the rotation angle, both multiplied by the magnitude scale, to the two
 // out parameters. With ext_factor == 0 the angle is freq_scale * theta_extrap and mscale is
 // used unchanged; otherwise the angle is blended with theta_extrap through rope_yarn_ramp()
 // and mscale is adjusted.
 void rope_yarn(
    float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
    out float cos_theta, out float sin_theta
 ) {
    // Get n-d rotational scaling corrected for extrapolation
    const float theta_interp = freq_scale * theta_extrap;
    float theta;
    if (ext_factor == 0.0f) {
        theta = theta_interp;
    } else {
        const float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
        // Get n-d magnitude scaling corrected for interpolation
        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
    }
    cos_theta = cos(theta) * mscale;
    sin_theta = sin(theta) * mscale;
 }
 // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
 // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
 // (n_orig_ctx plays the role of max_pos_emb here).
 float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
    const float numer = n_dims * log(n_orig_ctx / (n_rot * TWOPI_F));
    const float denom = 2 * log(base);
    return numer / denom;
 }
 // Computes the start and end correction dims: dims[0] is the floored correction factor for
 // beta_fast, clamped below at 0; dims[1] is the ceiled factor for beta_slow, clamped above
 // at n_dims - 1.
 void rope_yarn_corr_dims(
    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2]
 ) {
    const float start_dim = floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base));
    const float end_dim   = ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base));
    dims[0] = max(0.0f, start_dim);
    dims[1] = min(n_dims - 1.0f, end_dim);
 }
--- a/llama.cpp
+++ b/llama.cpp
@ -15,6 +15,8 @@
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 #  include "ggml-sycl.h"
 #elif defined(GGML_USE_KOMPUTE)
 #   include "ggml-kompute.h"
 #endif
 #ifdef GGML_USE_METAL
@ -1313,6 +1315,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
    buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
    buft = ggml_backend_kompute_buffer_type(gpu);
    if (buft == nullptr) {
        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
    }
 #endif
    if (buft == nullptr) {
@ -4107,7 +4114,7 @@ static bool llm_load_tensors(
 }
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
    try {
        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
@ -4128,6 +4135,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
            return 0;
        }
 #ifdef GGML_USE_KOMPUTE
        if (ggml_vk_has_device() && params.n_gpu_layers > 0 && (
            !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
            || !(
                model.ftype == LLAMA_FTYPE_ALL_F32 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
            )
        )) {
            // disable Vulkan due to unsupported model architecture or quantization type
            // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
            params.n_gpu_layers = 0;
        }
 #endif
        if (!llm_load_tensors(
            ml, model, params.n_gpu_layers, params.split_mode,  params.main_gpu, params.tensor_split, params.use_mlock,
            params.progress_callback, params.progress_callback_user_data
@ -10259,6 +10282,16 @@ struct llama_context * llama_new_context_with_model(
            }
            ctx->backends.push_back(backend);
        }
 #elif defined(GGML_USE_KOMPUTE)
        if (model->n_gpu_layers > 0) {
            auto * backend = ggml_backend_kompute_init(model->main_gpu);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                llama_free(ctx);
                return nullptr;
            }
            ctx->backends.push_back(backend);
        }
 #endif
        ctx->backend_cpu = ggml_backend_cpu_init();
        if (ctx->backend_cpu == nullptr) {
--- a/llama.h
+++ b/llama.h
@ -49,7 +49,8 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -370,12 +370,15 @@ struct test_case {
        printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
        fflush(stdout);
-        // check if backends support op
+        // check if the backends support the ops
        bool supported = true;
        for (ggml_backend_t backend : {backend1, backend2}) {
-            if (!ggml_backend_supports_op(backend, out)) {
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                if (!ggml_backend_supports_op(backend, t)) {
                    printf("not supported [%s] ", ggml_backend_name(backend));
                    supported = false;
                    break;
                }
            }
        }
        if (!supported) {
@ -626,6 +629,13 @@ struct test_unary : public test_case {
        ggml_tensor * out = ggml_unary(ctx, in, op);
        return out;
    }
    // Fills every tensor in the context with uniform values in [-150, 150].
    void initialize_tensors(ggml_context * ctx) override {
        ggml_tensor * cur = ggml_get_first_tensor(ctx);
        while (cur != NULL) {
            // test extended range of values to check for NaNs in GELU
            init_tensor_uniform(cur, -150.f, 150.f);
            cur = ggml_get_next_tensor(ctx, cur);
        }
    }
 };
 // GGML_OP_GET_ROWS
@ -1066,18 +1076,24 @@ struct test_diag_mask_inf : public test_case {
 // Soft max test case. The added lines extend it with a scale factor and an optional mask,
 // both forwarded to ggml_soft_max_ext.
 struct test_soft_max : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const float scale;
    const bool mask;
    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
+        return VARS_TO_STR4(type, ne, scale, mask);
    }
    test_soft_max(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
+            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-        : type(type), ne(ne) {}
+            float scale = 1.0f,
            bool mask = false)
        : type(type), ne(ne), scale(scale), mask(mask) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_soft_max(ctx, a);
+        ggml_tensor * b = nullptr;
        // the mask, when requested, is a 2D tensor of shape ne[0] x ne[1]; otherwise b stays null
        if (mask) { b = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]); }
        ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale);
        return out;
    }
 };
@ -1474,6 +1490,393 @@ struct test_moe : public test_case {
    }
 };
 // Normalization flavour selected in test_llm::llm_build_norm.
 enum llm_norm_type {
    LLM_NORM,     // built with ggml_norm and hp.f_norm_eps
    LLM_NORM_RMS, // built with ggml_rms_norm and hp.f_norm_rms_eps
 };
 // Hyperparameters used by the LLM graph tests below.
 // The non-static members are brace-initialized positionally by test_llama and test_falcon,
 // so their declaration order must match those initializer lists.
 struct llama_hparams {
    uint32_t n_vocab;
    uint32_t n_embd;
    uint32_t n_head;
    uint32_t n_head_kv;
    static constexpr uint32_t n_layer = 1; // the test graphs build a single layer
    uint32_t n_rot;
    uint32_t n_embd_head; // dimension of values (d_v)
    uint32_t n_ff;
    float f_norm_eps;     // epsilon passed to ggml_norm
    float f_norm_rms_eps; // epsilon passed to ggml_rms_norm

    // cparams
    static constexpr uint32_t n_ctx = 512; // user-specified context size
    static constexpr uint32_t n_orig_ctx = n_ctx;

    // batch
    int32_t n_tokens;

    // llm_build_context
    static constexpr int32_t n_kv    = 32; // size of KV cache to consider (n_kv <= n_ctx)
    static constexpr int32_t kv_head = 1;  // index of where we store new KV data in the cache

    uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
        return n_embd_head * n_head_kv;
    }
 };
 // LLM base class: holds the hyperparameters and the graph-building helpers that
 // test_llama and test_falcon share.
 struct test_llm : public test_case {
    llama_hparams hp;
 protected:
    // only constructible through a derived test case
    test_llm(llama_hparams hp)
        : hp(std::move(hp)) {
    }
 public:
    // Normalizes `cur` with ggml_norm or ggml_rms_norm (selected by `type`), multiplies the
    // result by the weight `mw` and, when `mb` is non-null, adds the bias `mb`.
    struct ggml_tensor * llm_build_norm(
            struct ggml_context * ctx,
             struct ggml_tensor * cur,
             struct ggml_tensor * mw,
             struct ggml_tensor * mb,
                  llm_norm_type   type) {
        switch (type) {
            case LLM_NORM:     cur = ggml_norm    (ctx, cur, hp.f_norm_eps); break;
            case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
        }
        cur = ggml_mul(ctx, cur, mw);
        if (mb) {
            cur = ggml_add(ctx, cur, mb);
        }
        return cur;
    }
    // Creates ggml_cpy nodes that copy k_cur and the transposed v_cur into views of the
    // cache tensors k_l / v_l, offset by hp.kv_head.
    // NOTE(review): the ggml_cpy results are not returned or referenced afterwards; whether
    // these copies end up in the evaluated graph depends on the test harness - confirm.
    void llm_build_kv_store(
            struct ggml_context * ctx,
             struct ggml_tensor * k_l,
             struct ggml_tensor * v_l,
             struct ggml_tensor * k_cur,
             struct ggml_tensor * v_cur) {
        // reshape V to [n_embd_gqa, n_tokens] and transpose it
        struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
        // destination for K: n_tokens rows of n_embd_gqa elements, starting at row kv_head
        struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
                (ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
        // destination for transposed V: row stride of n_ctx elements, starting at element kv_head
        struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
                (  hp.n_ctx)*ggml_element_size(v_l),
                (hp.kv_head)*ggml_element_size(v_l));
        // important: storing RoPE-ed version of K in the KV cache!
        ggml_cpy(ctx, k_cur,   k_cache_view);
        ggml_cpy(ctx, v_cur_t, v_cache_view);
    }
    // Builds the attention part of the graph: multiplies a view of the cached K with Q,
    // applies ggml_soft_max_ext with kq_mask and kq_scale, multiplies a view of the cached V
    // with the result, merges the heads and applies a Q4_0 output projection.
    // (This helper has no ALiBi parameter.)
    struct ggml_tensor * llm_build_kqv(
            struct ggml_context * ctx,
             struct ggml_tensor * k_l,
             struct ggml_tensor * v_l,
             struct ggml_tensor * q_cur,
             struct ggml_tensor * kq_mask,
                        float     kq_scale) {
        struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
        // view cached k as [n_embd_head, n_kv, n_head_kv]
        struct ggml_tensor * k =
            ggml_view_3d(ctx, k_l,
                    hp.n_embd_head, hp.n_kv, hp.n_head_kv,
                    ggml_row_size(k_l->type, hp.n_embd_gqa()),
                    ggml_row_size(k_l->type, hp.n_embd_head),
                    0);
        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
        // view cached v as [n_kv, n_embd_head, n_head_kv] (one slice per k-v head)
        struct ggml_tensor * v =
            ggml_view_3d(ctx, v_l,
                    hp.n_kv, hp.n_embd_head, hp.n_head_kv,
                    ggml_element_size(v_l)*hp.n_ctx,
                    ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
                    0);
        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
        struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
        struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
        // output projection weight
        struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
        cur = ggml_mul_mat(ctx, wo, cur);
        return cur;
    }
    // I32 tensors are treated as the position input and receive hp.n_tokens random values
    // in [0, n_ctx); every other tensor is filled by init_tensor_uniform with its defaults.
    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->type == GGML_TYPE_I32) {
                // pos
                std::vector<int> data(hp.n_tokens);
                for (int i = 0; i < hp.n_tokens; i++) {
                    data[i] = rand() % hp.n_ctx;
                }
                ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
            } else {
                init_tensor_uniform(t);
            }
        }
    }
 };
 // Llama: builds a single-layer graph made of RMS norm, Q4_0 weight matmuls, RoPE (mode 0),
 // KV store + attention, a SiLU-gated feed-forward network, an output norm and the lm_head.
 struct test_llama : public test_llm {
    // RoPE parameters forwarded to ggml_rope_custom
    static constexpr float freq_base = 10000.0f;
    static constexpr float freq_scale = 1.0f;
    static constexpr float ext_factor = 0.0f;
    static constexpr float attn_factor = 1.0f;
    static constexpr float beta_fast = 32.0f;
    static constexpr float beta_slow = 1.0f;
    // the whole graph is reported under one name instead of the op of the output tensor
    std::string op_desc(ggml_tensor * t) override {
        GGML_UNUSED(t);
        return "LLAMA";
    }
    std::string vars() override {
        auto n_tokens = hp.n_tokens;
        return VARS_TO_STR1(n_tokens);
    }
    // error threshold used for this test case
    double max_nmse_err() override {
        return 2e-3;
    }
    // The initializer values are matched positionally to the non-static members of llama_hparams.
    test_llama(int n_tokens = 1)
        : test_llm({
            /*n_vocab        =*/ 32000,
            /*n_embd         =*/ 3200,
            /*n_head         =*/ 32,
            /*n_head_kv      =*/ 32,
            /*n_rot          =*/ 100,
            /*n_embd_head    =*/ 100,
            /*n_ff           =*/ 8640,
            /*f_norm_eps     =*/ 0.f,
            /*f_norm_rms_eps =*/ 1e-5f,
            /*n_tokens       =*/ n_tokens,
        }) {
    }
    ggml_tensor * build_graph(ggml_context * ctx) override {
        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
        // inp_pos - contains the positions
        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
        // KV cache tensors: 1638400 == n_ctx (512) * n_embd_gqa() (100 * 32) F16 elements each
        ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
        ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
        for (uint32_t il = 0; il < hp.n_layer; ++il) {
            // kept for the residual connection after attention
            struct ggml_tensor * inpSA = inpL;
            // norm
            ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
            cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
            // self-attention
            {
                ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
                ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
                ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
                // compute Q and K and RoPE them
                struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
                struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
                struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
                Qcur = ggml_rope_custom(
                    ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head,    hp.n_tokens), inp_pos,
                    hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
                Kcur = ggml_rope_custom(
                    ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
                    hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
                llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
                // attention scale is 1/sqrt(n_embd_head)
                cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
            }
            // residual connection around the attention block
            struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
            // feed-forward network
            ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
            cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
            ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
            ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff,   hp.n_embd);
            ggml_tensor * ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
            // down( silu(gate(x)) * up(x) )
            struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
            cur = ggml_mul_mat(ctx, ffn_gate, cur);
            cur = ggml_silu(ctx, cur);
            cur = ggml_mul(ctx, cur, tmp);
            cur = ggml_mul_mat(ctx, ffn_down, cur);
            // residual connection around the feed-forward block
            cur = ggml_add(ctx, cur, ffn_inp);
            // input for next layer
            inpL = cur;
        }
        cur = inpL;
        ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
        cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
        // lm_head
        ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
        cur = ggml_mul_mat(ctx, output, cur);
        return cur;
    }
 };
 // Falcon: builds a single-layer graph made of a biased ggml_norm, a fused QKV projection,
 // RoPE in neox mode, KV store + attention, a GELU feed-forward network that runs on the
 // same normalized input as attention, an output norm and the lm_head.
 struct test_falcon : public test_llm {
    // RoPE parameters forwarded to ggml_rope_custom
    static constexpr float freq_base = 10000.0f;
    static constexpr float freq_scale = 1.0f;
    static constexpr float ext_factor = 0.0f;
    static constexpr float attn_factor = 1.0f;
    static constexpr float beta_fast = 32.0f;
    static constexpr float beta_slow = 1.0f;
    // the whole graph is reported under one name instead of the op of the output tensor
    std::string op_desc(ggml_tensor * t) override {
        GGML_UNUSED(t);
        return "FALCON";
    }
    std::string vars() override {
        auto n_tokens = hp.n_tokens;
        return VARS_TO_STR1(n_tokens);
    }
    // error threshold used for this test case
    double max_nmse_err() override {
        return 2e-3;
    }
    // The initializer values are matched positionally to the non-static members of llama_hparams.
    test_falcon(int n_tokens = 1)
        : test_llm({
            /*n_vocab        =*/ 32000,
            /*n_embd         =*/ 3200,
            /*n_head         =*/ 50,
            /*n_head_kv      =*/ 1,
            /*n_rot          =*/ 64,
            /*n_embd_head    =*/ 64,
            /*n_ff           =*/ 8640,
            /*f_norm_eps     =*/ 1e-5f,
            /*f_norm_rms_eps =*/ 0.f,
            /*n_tokens       =*/ n_tokens,
        }) {
    }
    ggml_tensor * build_graph(ggml_context * ctx) override {
        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
        // inp_pos - contains the positions
        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
        // KV cache tensors: same 1638400-element size as in test_llama
        // (n_embd_gqa() is only 64 * 1 here, so n_ctx * n_embd_gqa() would be 32768)
        ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
        ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
        for (uint32_t il = 0; il < hp.n_layer; ++il) {
            // norm
            ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
            ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
            ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
            // self-attention
            {
                cur = attn_norm;
                // fused projection: each output row holds Q (n_embd), then K and V (n_embd_gqa each)
                ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
                cur = ggml_mul_mat(ctx, wqkv, cur);
                // slice Q, K and V out of the fused result by float offset within a row
                struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd,     hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
                struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
                struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
                Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head,    hp.n_tokens);
                Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
                // using mode = 2 for neox mode
                Qcur = ggml_rope_custom(
                    ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                );
                Kcur = ggml_rope_custom(
                    ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                );
                llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
                // attention scale is 1/sqrt(n_embd_head)
                cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
            }
            // despite its name, ffn_inp holds the attention output; it is added back below
            struct ggml_tensor * ffn_inp = cur;
            // feed forward
            {
                ggml_tensor * ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
                ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
                // the feed-forward branch starts from attn_norm, not from the attention output
                cur = attn_norm;
                cur = ggml_mul_mat(ctx, ffn_up, cur);
                cur = ggml_gelu(ctx, cur);
                cur = ggml_mul_mat(ctx, ffn_down, cur);
            }
            // sum of feed-forward output, attention output and the layer input
            cur = ggml_add(ctx, cur, ffn_inp);
            cur = ggml_add(ctx, cur, inpL);
            // input for next layer
            inpL = cur;
        }
        cur = inpL;
        ggml_tensor * output_norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
        ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
        cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
        // lm_head (Q8_0, unlike the Q4_0 weights above)
        ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
        cur = ggml_mul_mat(ctx, output, cur);
        return cur;
    }
 };
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
    std::vector<std::unique_ptr<test_case>> test_cases;
    std::default_random_engine rng(0);
@ -1626,6 +2029,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        exponent <<= 1;
    }
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));
    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
        test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B
        test_cases.emplace_back(new test_rope(type, {128,  40, 10, 1}, 128, 0, 512)); // llama 13B
@ -1662,6 +2068,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
 #endif
    // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
    test_cases.emplace_back(new test_llama(1));
    test_cases.emplace_back(new test_llama(2));
    test_cases.emplace_back(new test_falcon(1));
    test_cases.emplace_back(new test_falcon(2));
 #endif
    // run tests
    if (mode == MODE_TEST) {
        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
--- a/tests/test-c.c
+++ b/tests/test-c.c
@ -1,3 +1,7 @@
 #include "llama.h"
 #ifdef GGML_USE_KOMPUTE
 #include "ggml-kompute.h"
 #endif
 /* Empty entry point; this translation unit only needs to compile. */
 int main(void) {
    return 0;
 }
		`@ -0,0 +1 @@`
							`Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306`