From ba15dfd0be8d08390fe29c88a8e82b1089af3a4c Mon Sep 17 00:00:00 2001
From: niansa <anton-sa@web.de>
Date: Thu, 22 Jun 2023 12:58:07 +0200
Subject: [PATCH] Nomic vulkan backend licensed under the Software for Open
 Models License (SOM), version 1.0.

---
 .gitmodules                                   |    0
 CMakeLists.txt                                |  123 ++
 LICENSE_SOM.txt                               |   30 +
 examples/main/main.cpp                        |    8 +
 ggml-vulkan.cpp                               | 1313 +++++++++++++++++
 ggml-vulkan.h                                 |   61 +
 ggml.c                                        |   32 +-
 kompute/.ccls                                 |   27 +
 kompute/.clang-format                         |    5 +
 kompute/.dockerignore                         |    4 +
 kompute/.github/workflows/cpp_examples.yml    |   58 +
 kompute/.github/workflows/cpp_tests.yml       |  104 ++
 kompute/.github/workflows/python_tests.yml    |   28 +
 kompute/CMakeLists.txt                        |  187 +++
 kompute/LICENSE                               |  203 +++
 kompute/Makefile                              |  210 +++
 kompute/README.md                             |  513 +++++++
 kompute/cmake/bin2h.cmake                     |  106 ++
 kompute/cmake/bin_file_to_header.cmake        |   19 +
 kompute/cmake/check_vulkan_version.cmake      |  139 ++
 kompute/cmake/code_coverage.cmake             |   35 +
 kompute/cmake/deprecation_warnings.cmake      |   15 +
 kompute/cmake/komputeConfig.cmake.in          |    8 +
 kompute/cmake/vulkan_shader_compiler.cmake    |   43 +
 kompute/config/FindSphinx.cmake               |   16 +
 kompute/external/bin/xxd.c                    |  819 ++++++++++
 kompute/kompute-config.cmake                  |   28 +
 kompute/op_add.comp                           |  145 ++
 kompute/op_addrow.comp                        |  145 ++
 kompute/op_cpy_f16_f16.comp                   |  176 +++
 kompute/op_cpy_f16_f32.comp                   |  176 +++
 kompute/op_cpy_f32_f16.comp                   |  176 +++
 kompute/op_cpy_f32_f32.comp                   |  168 +++
 kompute/op_diagmask.comp                      |  153 ++
 kompute/op_gelu.comp                          |  142 ++
 kompute/op_getrows_f16.comp                   |  150 ++
 kompute/op_getrows_q4_0.comp                  |  179 +++
 kompute/op_getrows_q4_1.comp                  |  181 +++
 kompute/op_mul.comp                           |  145 ++
 kompute/op_mul_mat_f16.comp                   |  177 +++
 kompute/op_mul_mat_q4_0.comp                  |  195 +++
 kompute/op_mul_mat_q4_1.comp                  |  218 +++
 kompute/op_mulrow.comp                        |  145 ++
 kompute/op_norm.comp                          |  209 +++
 kompute/op_relu.comp                          |  141 ++
 kompute/op_rmsnorm.comp                       |  178 +++
 kompute/op_rope.comp                          |  183 +++
 kompute/op_scale.comp                         |  142 ++
 kompute/op_silu.comp                          |  141 ++
 kompute/op_softmax.comp                       |  197 +++
 kompute/scripts/convert_shaders.py            |  148 ++
 kompute/scripts/requirements.txt              |   11 +
 kompute/setup.py                              |   93 ++
 kompute/src/Algorithm.cpp                     |  450 ++++++
 kompute/src/CMakeLists.txt                    |   82 +
 kompute/src/Core.cpp                          |   27 +
 kompute/src/Manager.cpp                       |  493 +++++++
 kompute/src/OpAlgoDispatch.cpp                |   65 +
 kompute/src/OpBufferSyncDevice.cpp            |   51 +
 kompute/src/OpBufferSyncLocal.cpp             |   51 +
 kompute/src/OpMemoryBarrier.cpp               |   74 +
 kompute/src/OpTensorCopy.cpp                  |   90 ++
 kompute/src/OpTensorSyncDevice.cpp            |   61 +
 kompute/src/OpTensorSyncLocal.cpp             |   76 +
 kompute/src/Sequence.cpp                      |  396 +++++
 kompute/src/Tensor.cpp                        |  451 ++++++
 kompute/src/include/CMakeLists.txt            |   46 +
 kompute/src/include/kompute/Algorithm.hpp     |  338 +++++
 kompute/src/include/kompute/Core.hpp          |   39 +
 kompute/src/include/kompute/Kompute.hpp       |   21 +
 kompute/src/include/kompute/Manager.hpp       |  267 ++++
 kompute/src/include/kompute/Sequence.hpp      |  313 ++++
 kompute/src/include/kompute/Tensor.hpp        |  306 ++++
 kompute/src/include/kompute/logger/Logger.hpp |  197 +++
 .../kompute/operations/OpAlgoDispatch.hpp     |   86 ++
 .../src/include/kompute/operations/OpBase.hpp |   62 +
 .../kompute/operations/OpBufferSyncDevice.hpp |   50 +
 .../kompute/operations/OpBufferSyncLocal.hpp  |   50 +
 .../kompute/operations/OpMemoryBarrier.hpp    |   81 +
 .../src/include/kompute/operations/OpMult.hpp |   58 +
 .../kompute/operations/OpTensorCopy.hpp       |   63 +
 .../kompute/operations/OpTensorSyncDevice.hpp |   66 +
 .../kompute/operations/OpTensorSyncLocal.hpp  |   66 +
 kompute/src/logger/CMakeLists.txt             |   69 +
 kompute/src/logger/Logger.cpp                 |  101 ++
 kompute/src/shaders/CMakeLists.txt            |    5 +
 kompute/src/shaders/glsl/CMakeLists.txt       |   26 +
 .../glsl/ShaderLogisticRegression.comp        |   52 +
 .../glsl/ShaderLogisticRegression.hpp.in      |  310 ++++
 kompute/src/shaders/glsl/ShaderOpMult.comp    |   28 +
 kompute/src/shaders/glsl/ShaderOpMult.hpp.in  |  101 ++
 kompute/src/shaders/hlsl/computeheadless.comp |   29 +
 llama.cpp                                     |   47 +-
 llama.h                                       |    2 +-
 undump.py                                     |   18 +
 95 files changed, 13489 insertions(+), 23 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 LICENSE_SOM.txt
 create mode 100644 ggml-vulkan.cpp
 create mode 100644 ggml-vulkan.h
 create mode 100644 kompute/.ccls
 create mode 100644 kompute/.clang-format
 create mode 100644 kompute/.dockerignore
 create mode 100644 kompute/.github/workflows/cpp_examples.yml
 create mode 100644 kompute/.github/workflows/cpp_tests.yml
 create mode 100644 kompute/.github/workflows/python_tests.yml
 create mode 100644 kompute/CMakeLists.txt
 create mode 100644 kompute/LICENSE
 create mode 100644 kompute/Makefile
 create mode 100644 kompute/README.md
 create mode 100644 kompute/cmake/bin2h.cmake
 create mode 100644 kompute/cmake/bin_file_to_header.cmake
 create mode 100644 kompute/cmake/check_vulkan_version.cmake
 create mode 100644 kompute/cmake/code_coverage.cmake
 create mode 100644 kompute/cmake/deprecation_warnings.cmake
 create mode 100644 kompute/cmake/komputeConfig.cmake.in
 create mode 100644 kompute/cmake/vulkan_shader_compiler.cmake
 create mode 100644 kompute/config/FindSphinx.cmake
 create mode 100644 kompute/external/bin/xxd.c
 create mode 100644 kompute/kompute-config.cmake
 create mode 100644 kompute/op_add.comp
 create mode 100644 kompute/op_addrow.comp
 create mode 100644 kompute/op_cpy_f16_f16.comp
 create mode 100644 kompute/op_cpy_f16_f32.comp
 create mode 100644 kompute/op_cpy_f32_f16.comp
 create mode 100644 kompute/op_cpy_f32_f32.comp
 create mode 100644 kompute/op_diagmask.comp
 create mode 100644 kompute/op_gelu.comp
 create mode 100644 kompute/op_getrows_f16.comp
 create mode 100644 kompute/op_getrows_q4_0.comp
 create mode 100644 kompute/op_getrows_q4_1.comp
 create mode 100644 kompute/op_mul.comp
 create mode 100644 kompute/op_mul_mat_f16.comp
 create mode 100644 kompute/op_mul_mat_q4_0.comp
 create mode 100644 kompute/op_mul_mat_q4_1.comp
 create mode 100644 kompute/op_mulrow.comp
 create mode 100644 kompute/op_norm.comp
 create mode 100644 kompute/op_relu.comp
 create mode 100644 kompute/op_rmsnorm.comp
 create mode 100644 kompute/op_rope.comp
 create mode 100644 kompute/op_scale.comp
 create mode 100644 kompute/op_silu.comp
 create mode 100644 kompute/op_softmax.comp
 create mode 100644 kompute/scripts/convert_shaders.py
 create mode 100644 kompute/scripts/requirements.txt
 create mode 100644 kompute/setup.py
 create mode 100644 kompute/src/Algorithm.cpp
 create mode 100644 kompute/src/CMakeLists.txt
 create mode 100644 kompute/src/Core.cpp
 create mode 100644 kompute/src/Manager.cpp
 create mode 100644 kompute/src/OpAlgoDispatch.cpp
 create mode 100644 kompute/src/OpBufferSyncDevice.cpp
 create mode 100644 kompute/src/OpBufferSyncLocal.cpp
 create mode 100644 kompute/src/OpMemoryBarrier.cpp
 create mode 100644 kompute/src/OpTensorCopy.cpp
 create mode 100644 kompute/src/OpTensorSyncDevice.cpp
 create mode 100644 kompute/src/OpTensorSyncLocal.cpp
 create mode 100644 kompute/src/Sequence.cpp
 create mode 100644 kompute/src/Tensor.cpp
 create mode 100644 kompute/src/include/CMakeLists.txt
 create mode 100644 kompute/src/include/kompute/Algorithm.hpp
 create mode 100644 kompute/src/include/kompute/Core.hpp
 create mode 100644 kompute/src/include/kompute/Kompute.hpp
 create mode 100644 kompute/src/include/kompute/Manager.hpp
 create mode 100644 kompute/src/include/kompute/Sequence.hpp
 create mode 100644 kompute/src/include/kompute/Tensor.hpp
 create mode 100644 kompute/src/include/kompute/logger/Logger.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpBase.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpMult.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpTensorCopy.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
 create mode 100644 kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
 create mode 100644 kompute/src/logger/CMakeLists.txt
 create mode 100644 kompute/src/logger/Logger.cpp
 create mode 100644 kompute/src/shaders/CMakeLists.txt
 create mode 100644 kompute/src/shaders/glsl/CMakeLists.txt
 create mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.comp
 create mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
 create mode 100644 kompute/src/shaders/glsl/ShaderOpMult.comp
 create mode 100644 kompute/src/shaders/glsl/ShaderOpMult.hpp.in
 create mode 100644 kompute/src/shaders/hlsl/computeheadless.comp
 create mode 100644 undump.py

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..e69de29bb
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4a649a97..88585fb93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,6 +86,7 @@ option(LLAMA_HIPBLAS                         "llama: use hipBLAS"
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
+option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
@@ -412,6 +413,127 @@ if (LLAMA_HIPBLAS)
     endif()
 endif()
 
+if (LLAMA_KOMPUTE)
+    find_package(Vulkan COMPONENTS glslc REQUIRED) # glslc is needed below to compile the .comp shaders to SPIR-V
+    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) # NOTE(review): HINTS takes directories, but Vulkan::glslc is an imported target name - confirm this hint has any effect
+    if (NOT glslc_executable)
+        message(FATAL_ERROR "glslc not found") # abort configuration: no shader compiler available
+    endif()
+
+    function(compile_shader) # compile_shader(SOURCES a.comp ...): for each shader, build a.comp.spv and a C header embedding it
+      set(options)
+      set(oneValueArgs)
+      set(multiValueArgs SOURCES) # SOURCES is the only keyword this function accepts
+      cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+      foreach(source ${compile_shader_SOURCES})
+        set(spv_file ${source}.spv) # e.g. kompute/op_scale.comp.spv
+        add_custom_command(
+            OUTPUT ${spv_file}
+            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            COMMENT "Compiling ${source} to ${source}.spv"
+        )
+
+        get_filename_component(RAW_FILE_NAME ${spv_file} NAME) # e.g. op_scale.comp.spv
+        set(FILE_NAME "shader${RAW_FILE_NAME}") # e.g. shaderop_scale.comp.spv
+        string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) # e.g. shaderop_scale.h
+        string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
+        string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") # include-guard macro, e.g. SHADEROP_SCALE_H
+        set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
+        message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
+        add_custom_command(
+          OUTPUT ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+          COMMAND xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE} # NOTE(review): relies on an xxd found on PATH at build time - confirm it is always available
+          COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          DEPENDS ${spv_file}
+          COMMENT "Converting to hpp: ${FILE_NAME}"
+        )
+      endforeach()
+    endfunction()
+
+    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") # enable the backend only when the bundled kompute tree is present
+        message(STATUS "Kompute found")
+        add_subdirectory(kompute)
+
+        # Compile our shaders
+        compile_shader(SOURCES
+          kompute/op_scale.comp
+          kompute/op_add.comp
+          kompute/op_addrow.comp
+          kompute/op_mul.comp
+          kompute/op_mulrow.comp
+          kompute/op_silu.comp
+          kompute/op_relu.comp
+          kompute/op_gelu.comp
+          kompute/op_softmax.comp
+          kompute/op_norm.comp
+          kompute/op_rmsnorm.comp
+          kompute/op_diagmask.comp
+          kompute/op_mul_mat_f16.comp
+          kompute/op_mul_mat_q4_0.comp
+          kompute/op_mul_mat_q4_1.comp
+          kompute/op_getrows_f16.comp
+          kompute/op_getrows_q4_0.comp
+          kompute/op_getrows_q4_1.comp
+          kompute/op_rope.comp
+          kompute/op_cpy_f16_f16.comp
+          kompute/op_cpy_f16_f32.comp
+          kompute/op_cpy_f32_f16.comp
+          kompute/op_cpy_f32_f32.comp
+        )
+
+        # Create a custom target for our generated shaders
+        add_custom_target(generated_shaders DEPENDS # one header per shader listed above; keep both lists in sync
+          shaderop_scale.h
+          shaderop_add.h
+          shaderop_addrow.h
+          shaderop_mul.h
+          shaderop_mulrow.h
+          shaderop_silu.h
+          shaderop_relu.h
+          shaderop_gelu.h
+          shaderop_softmax.h
+          shaderop_norm.h
+          shaderop_rmsnorm.h
+          shaderop_diagmask.h
+          shaderop_mul_mat_f16.h
+          shaderop_mul_mat_q4_0.h
+          shaderop_mul_mat_q4_1.h
+          shaderop_getrows_f16.h
+          shaderop_getrows_q4_0.h
+          shaderop_getrows_q4_1.h
+          shaderop_rope.h
+          shaderop_cpy_f16_f16.h
+          shaderop_cpy_f16_f32.h
+          shaderop_cpy_f32_f16.h
+          shaderop_cpy_f32_f32.h
+        )
+
+        # Create a custom command that depends on the generated_shaders
+        add_custom_command(
+            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
+            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
+            DEPENDS generated_shaders
+            COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
+        )
+
+        # Add the stamp to the main sources to ensure dependency tracking
+        set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
+        set(GGML_HEADERS_KOMPUTE ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
+        add_compile_definitions(GGML_USE_KOMPUTE) # guards the Kompute-specific code paths, e.g. in examples/main/main.cpp
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) # lets ggml-vulkan.cpp include the generated shaderop_*.h headers
+    else()
+        message(WARNING "Kompute not found") # NOTE(review): LLAMA_KOMPUTE was requested, yet the build continues without the backend - confirm a warning is intended
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
@@ -648,6 +770,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
             )
 
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
diff --git a/LICENSE_SOM.txt b/LICENSE_SOM.txt
new file mode 100644
index 000000000..eb912c0fd
--- /dev/null
+++ b/LICENSE_SOM.txt
@@ -0,0 +1,30 @@
+Software for Open Models License (SOM)
+Version 1.0 dated August 30th, 2023
+
+This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
+
+This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
+
+1. Definitions
+The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
+A “Model” is the output of a machine learning algorithm, and excludes the Software.
+“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
+“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
+
+2. Grant of Rights. Subject to the conditions and limitations in section 3:
+(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
+
+(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software.  No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
+
+3. Conditions and Limitations
+(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms. 
+
+(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
+
+(C) No Trademark License. This license does not grant you rights to use the Licensor’s name, logo, or trademarks.
+
+(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
+
+(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
+
+(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index d78112260..16f8fc72b 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -33,6 +33,10 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#if defined(GGML_USE_KOMPUTE)
+#include "ggml-vulkan.h"
+#endif
+
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
 static gpt_params               * g_params;
@@ -171,6 +175,10 @@ int main(int argc, char ** argv) {
     g_model = &model;
     g_ctx = &ctx;
 
+#if defined(GGML_USE_KOMPUTE)
+    ggml_vk_init_device(0, "gpu");
+#endif
+
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
new file mode 100644
index 000000000..32590d03e
--- /dev/null
+++ b/ggml-vulkan.cpp
@@ -0,0 +1,1313 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "ggml-vulkan.h"
+#include "ggml.h"
+
+// These are generated at build time by cmake custom command
+#include "shaderop_scale.h"
+#include "shaderop_add.h"
+#include "shaderop_addrow.h"
+#include "shaderop_mul.h"
+#include "shaderop_mulrow.h"
+#include "shaderop_silu.h"
+#include "shaderop_relu.h"
+#include "shaderop_gelu.h"
+#include "shaderop_softmax.h"
+#include "shaderop_norm.h"
+#include "shaderop_rmsnorm.h"
+#include "shaderop_diagmask.h"
+#include "shaderop_mul_mat_f16.h"
+#include "shaderop_mul_mat_q4_0.h"
+#include "shaderop_mul_mat_q4_1.h"
+#include "shaderop_getrows_f16.h"
+#include "shaderop_getrows_q4_0.h"
+#include "shaderop_getrows_q4_1.h"
+#include "shaderop_rope.h"
+#include "shaderop_cpy_f16_f16.h"
+#include "shaderop_cpy_f16_f32.h"
+#include "shaderop_cpy_f32_f16.h"
+#include "shaderop_cpy_f32_f32.h"
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <utility>
+#include <fstream>
+#include <exception>
+#include <thread>
+#include <mutex>
+#include <atomic>
+#include <cstring>
+#include <immintrin.h>
+#include <kompute/Kompute.hpp>
+
+#ifndef __STDC_IEC_559__
+#warning Your C implementation does not seem to be IEC 559 compliant, which is required for proper Vulkan interop.
+#endif
+
+#define QK4_0 32 // presumably the Q4_0 quantization block size - verify against ggml.c
+#define QR4_0 2 // NOTE(review): meaning not visible in this chunk - confirm against ggml's Q4_0 definitions
+#define QK4_1 32 // presumably the Q4_1 quantization block size - verify against ggml.c
+
+typedef ggml_fp16_t half; // short alias for ggml_fp16_t
+
+struct ggml_kompute_context { // backend state; constructing one publishes it through the static `instance` pointer
+    bool hasH2DAll = false; // value returned by ggml_vk_has_h2d_all(); NOTE(review): presumably "all host-to-device uploads done" - confirm
+    std::vector<ggml_vk_memory> buffers; // ggml_vk_memory records held by this context (usage is not visible in this chunk)
+    std::shared_ptr<vk::DescriptorPool> pool; // set by ggml_vk_allocate_descriptor_pool(), released by ggml_vk_free_descriptor_pool()
+    static ggml_kompute_context *instance; // most recently constructed context while it is alive, otherwise nullptr
+    ggml_kompute_context() { instance = this; }
+    // Reset the published pointer on destruction so it never dangles after the context is deleted.
+    ~ggml_kompute_context() { if (instance == this) instance = nullptr; }
+};
+
+ggml_kompute_context *ggml_kompute_context::instance; // out-of-class definition of the static member; null until a context is constructed
+
+kp::Manager mgr; // file-level Kompute manager shared by the functions below
+
+#ifdef __linux__
+__attribute__((constructor)) // run automatically when the binary/library is loaded, before main()
+static void enable_sam() { // sets RADV_PERFTEST=sam; presumably enables Smart Access Memory in Mesa's RADV driver - confirm
+    setenv("RADV_PERFTEST", "sam", false); // overwrite=false: keep any value the user already exported
+}
+#endif
+
+static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physicalDevice) { // true only if the device reports every feature tested below
+    vk::PhysicalDeviceFeatures availableFeatures;
+    physicalDevice.getFeatures(&availableFeatures);
+
+    if (!availableFeatures.shaderInt16) // 16-bit integer support in shaders
+        return false;
+
+    vk::PhysicalDeviceVulkan11Features availableFeatures11;
+    vk::PhysicalDeviceVulkan12Features availableFeatures12;
+
+    availableFeatures11.pNext = &availableFeatures12; // query chain: features2 -> 1.1 features -> 1.2 features
+    availableFeatures12.pNext = nullptr;
+
+    vk::PhysicalDeviceFeatures2 features2;
+    features2.pNext = &availableFeatures11;
+
+    physicalDevice.getFeatures2(&features2); // fills both chained structs in a single query
+
+    if (!availableFeatures11.uniformAndStorageBuffer16BitAccess || // 16-bit values in uniform/storage buffers
+        !availableFeatures11.storageBuffer16BitAccess) {
+        return false;
+    }
+
+    if (!availableFeatures12.storageBuffer8BitAccess || // 8-bit buffer access plus fp16/int8 arithmetic in shaders
+        !availableFeatures12.uniformAndStorageBuffer8BitAccess ||
+        !availableFeatures12.shaderFloat16 ||
+        !availableFeatures12.shaderInt8) {
+        return false;
+    }
+
+    return true;
+}
+
+static std::string ggml_vk_getVendorName(uint32_t vendorID) { // map a vendor ID to the lowercase name used for vendor filtering
+    switch (vendorID) {
+        case 0x10DE:
+            return "nvidia";
+        case 0x1002:
+            return "amd";
+        case 0x8086:
+            return "intel";
+        default:
+            return "unknown"; // any vendor ID not listed above
+    }
+}
+
+std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) { // usable devices with at least memoryRequired bytes in their first device-local heap, sorted by preference
+    std::vector<vk::PhysicalDevice> physicalDevices = mgr.listDevices();
+    uint32_t deviceCount = physicalDevices.size();
+
+    std::vector<ggml_vk_device> results;
+
+    if (deviceCount == 0)
+        return results;
+
+    for (uint32_t i = 0; i < deviceCount; i++) {
+        VkPhysicalDeviceProperties properties;
+        vkGetPhysicalDeviceProperties(physicalDevices.at(i), &properties);
+
+        VkPhysicalDeviceMemoryProperties memoryProperties;
+        vkGetPhysicalDeviceMemoryProperties(physicalDevices.at(i), &memoryProperties);
+
+        const uint32_t major = VK_VERSION_MAJOR(properties.apiVersion);
+        const uint32_t minor = VK_VERSION_MINOR(properties.apiVersion);
+        if (major < 1 || (major == 1 && minor < 2)) // require Vulkan >= 1.2; testing minor alone would reject e.g. 2.0
+            continue;
+
+        if (!ggml_vk_checkPhysicalDeviceFeatures(physicalDevices.at(i))) // skip devices lacking the 8/16-bit shader features
+            continue;
+
+        size_t heapSize = 0;
+        for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) {
+            VkMemoryHeap heap = memoryProperties.memoryHeaps[j];
+            if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
+                heapSize = heap.size; // only the first device-local heap is considered
+                break;
+            }
+        }
+
+        if (heapSize < memoryRequired)
+            continue;
+
+        ggml_vk_device d;
+        d.index = i; // position in mgr.listDevices(); later passed to ggml_vk_init_device(int)
+        d.type = properties.deviceType;
+        d.heapSize = heapSize;
+        d.name = properties.deviceName;
+        d.vendor = ggml_vk_getVendorName(properties.vendorID);
+        results.push_back(d);
+    }
+
+    std::stable_sort(results.begin(), results.end(), // order: discrete GPUs, then integrated GPUs, then everything else
+        [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool {
+            if (lhs.type != rhs.type) {
+                if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true;
+                if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false;
+
+                if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true;
+                if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false;
+            }
+            return lhs.heapSize < rhs.heapSize; // NOTE(review): ascending, so the smallest qualifying heap sorts first - confirm intended
+        }
+    );
+
+    return results;
+}
+
+static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) { // keep, in place, only devices whose vendor equals targetVendor
+    devices.erase(
+        std::remove_if(devices.begin(), devices.end(), // erase-remove: survivors keep their relative order
+            [&targetVendor](const ggml_vk_device& device) {
+                return device.vendor != targetVendor; // exact, case-sensitive comparison
+            }),
+        devices.end()
+    );
+}
+
+static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std::string& targetName) { // keep, in place, only devices whose name equals targetName
+    devices.erase(
+        std::remove_if(devices.begin(), devices.end(), // erase-remove: survivors keep their relative order
+            [&targetName](const ggml_vk_device& device) {
+                return device.name != targetName; // exact, case-sensitive comparison
+            }),
+        devices.end()
+    );
+}
+
+bool ggml_vk_init_device(size_t memoryRequired, const std::string &device) { // device: "gpu", a vendor ("amd"/"nvidia"/"intel"), or an exact device name
+    if (device.empty())
+        return false;
+
+    std::vector<ggml_vk_device> devices = ggml_vk_available_devices(memoryRequired); // candidates, already sorted by preference
+    if (device == "gpu") { // any device: take the first candidate
+        if (devices.size() != 0)
+            return ggml_vk_init_device(devices.front());
+    } else if (device == "amd" || device == "nvidia" || device == "intel") { // restrict to one vendor
+        ggml_vk_filterByVendor(devices, device);
+        if (devices.size() != 0)
+            return ggml_vk_init_device(devices.front());
+    } else { // anything else is treated as an exact device name
+        ggml_vk_filterByName(devices, device);
+        if (devices.size() != 0)
+            return ggml_vk_init_device(devices.front());
+    }
+
+    return ggml_vk_has_device(); // no candidate matched: reports whether the manager already has a device
+}
+
+bool ggml_vk_init_device(const ggml_vk_device &device) { // convenience overload: initialize using the descriptor's index
+    return ggml_vk_init_device(device.index);
+}
+
+bool ggml_vk_init_device(int device) { // initialize the file-level manager on the physical device with this index
+    mgr.initializeDevice(device, {},
+                         {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", // extension names passed to the manager
+                          "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"});
+    return ggml_vk_has_device(); // true if the manager holds a device afterwards
+}
+
+bool ggml_vk_has_device() { // whether the file-level Kompute manager currently has a device
+    return mgr.hasDevice();
+}
+
+ggml_vk_device ggml_vk_current_device() { // descriptor of the initialized device, or a default-constructed one if none can be reported
+    if (!mgr.hasDevice())
+        return ggml_vk_device();
+
+    std::vector<ggml_vk_device> devices = ggml_vk_available_devices(0); // 0: no minimum heap size
+    ggml_vk_filterByName(devices, mgr.physicalDevice()->getProperties().deviceName);
+    return devices.empty() ? ggml_vk_device() : devices.front(); // a device initialized by raw index may fail the version/feature filter; front() on an empty vector is undefined
+}
+
+ggml_kompute_context *ggml_vk_init() { // heap-allocates a context; release it with ggml_vk_free()
+    return new ggml_kompute_context;
+}
+
+bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) { // accessor for ctx->hasH2DAll; ctx must not be null
+    return ctx->hasH2DAll;
+}
+
+void ggml_vk_free(struct ggml_kompute_context * ctx) { // deletes a context obtained from ggml_vk_init()
+    delete ctx; // NOTE(review): does not call ggml_vk_free_descriptor_pool() - confirm the Vulkan pool is destroyed elsewhere first
+}
+
+static
+void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) { // create ctx->pool with room for `size` descriptor sets
+    std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
+        vk::DescriptorPoolSize(
+          vk::DescriptorType::eStorageBuffer,
+          3 * size // Descriptor count is number of possible tensors to pass into an algorithm
+          )
+    };
+
+    vk::DescriptorPoolCreateInfo descriptorPoolInfo(
+      vk::DescriptorPoolCreateFlags(),
+      size, // Max sets
+      static_cast<uint32_t>(descriptorPoolSizes.size()),
+      descriptorPoolSizes.data());
+
+    ctx->pool = std::make_shared<vk::DescriptorPool>(); // replaces any previous handle without destroying it
+    vk::Result r = mgr.device()->createDescriptorPool(
+      &descriptorPoolInfo, nullptr, ctx->pool.get());
+    if (r != vk::Result::eSuccess) // failure is only logged; ctx->pool then wraps a null handle
+        std::cerr << "Error allocating descriptor pool: " << vk::to_string(r) << std::endl;
+}
+
+static
+void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) { // destroy ctx->pool if one is set
+    if (ctx->pool) {
+        mgr.device()->destroy(
+          *ctx->pool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr); // no custom allocation callbacks
+        ctx->pool = nullptr; // drop the handle so a second call does nothing
+    }
+}
+
+static
+vk::Buffer *ggml_vk_allocate_buffer(size_t size) { // create a storage buffer of `size` bytes; no memory is bound here and the caller owns the returned object
+    vk::BufferCreateInfo bufferCreateInfo;
+    bufferCreateInfo.size = size;
+    bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer |
+                             vk::BufferUsageFlagBits::eTransferSrc |
+                             vk::BufferUsageFlagBits::eTransferDst; // shader storage plus copy source/destination
+    bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;
+
+    vk::Buffer *vkBuffer = new vk::Buffer;
+    vk::Result r = mgr.device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
+    if (r != vk::Result::eSuccess) // failure is only logged; the returned object then holds a null handle
+        std::cerr << "Error allocating buffer: " << vk::to_string(r) << std::endl;
+    return vkBuffer;
+}
+
+// Allocates device memory from the first memory type that is permitted by
+// requirements.memoryTypeBits and has all of the property bits in `flags`.
+//
+// size          - requested size in bytes; raised to requirements.size when
+//                 that is larger, so the memory is always big enough to bind.
+// isHostVisible - optional; set to true when the chosen type is host visible.
+//                 It is never written otherwise, so callers initialise it.
+//
+// Returns a heap-allocated vk::DeviceMemory handle owned by the caller.
+// Throws std::runtime_error when no memory type matches. An allocation
+// failure is only reported on stderr.
+static
+vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {
+
+    uint32_t memoryTypeIndex = 0;
+    bool memoryTypeIndexFound = false;
+    vk::PhysicalDeviceMemoryProperties memoryProperties = mgr.physicalDevice()->getMemoryProperties();
+    for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
+        // Unsigned literal: memoryTypeBits is a 32-bit mask and shifting a
+        // signed 1 into bit 31 is not well defined.
+        if (!(requirements.memoryTypeBits & (1u << i))) {
+            continue;
+        }
+        const vk::MemoryPropertyFlags available = memoryProperties.memoryTypes[i].propertyFlags;
+        if ((available & flags) != flags) {
+            continue;
+        }
+
+        memoryTypeIndex = i;
+        memoryTypeIndexFound = true;
+        if (isHostVisible && (available & vk::MemoryPropertyFlagBits::eHostVisible)) {
+            *isHostVisible = true;
+        }
+        break;
+    }
+    if (!memoryTypeIndexFound) {
+        throw std::runtime_error(
+          "Memory type index for buffer creation not found");
+    }
+
+    vk::MemoryAllocateInfo allocInfo;
+    // Binding a buffer requires at least requirements.size bytes of memory.
+    allocInfo.allocationSize = requirements.size > size
+        ? requirements.size
+        : static_cast<vk::DeviceSize>(size);
+    allocInfo.memoryTypeIndex = memoryTypeIndex;
+    vk::DeviceMemory *vkDeviceMemory =  new vk::DeviceMemory;
+    vk::Result r = mgr.device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
+    if (r != vk::Result::eSuccess) {
+        // Keep the result name separate from the message and terminate the line.
+        std::cerr << "Error allocating memory: " << vk::to_string(r) << '\n';
+    }
+    return vkDeviceMemory;
+}
+
+// Rounds `offset` down to a multiple of the physical device's
+// minStorageBufferOffsetAlignment limit. The limit is read from the device on
+// first use and kept in a function-local static afterwards.
+size_t ggml_vk_aligned_offset(size_t offset) {
+    static size_t alignment = 0;
+    if (alignment == 0) {
+        alignment = mgr.physicalDevice()->getProperties().limits.minStorageBufferOffsetAlignment;
+    }
+
+    // Dropping the remainder leaves already-aligned offsets unchanged and maps
+    // every other offset to the nearest multiple of the alignment below it.
+    const size_t remainder = offset % alignment;
+    return offset - remainder;
+}
+
+// Evaluates a kp::OpBufferSyncDevice over the primary/staging buffer pair.
+// Memory without a staging buffer is left alone.
+static void ggml_vk_h2d_buffer(const ggml_vk_memory &memory) {
+    if (!memory.stagingBuffer) {
+        return;
+    }
+    mgr.sequence()->eval<kp::OpBufferSyncDevice>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
+}
+
+// Evaluates a kp::OpBufferSyncLocal over the primary/staging buffer pair.
+// Memory without a staging buffer is left alone.
+static void ggml_vk_d2h_buffer(const ggml_vk_memory &memory) {
+    if (!memory.stagingBuffer) {
+        return;
+    }
+    mgr.sequence()->eval<kp::OpBufferSyncLocal>(memory.primaryBuffer, memory.stagingBuffer, memory.size);
+}
+
+// Allocates a `size`-byte buffer backed by device-local memory.
+//
+// If the memory type chosen for the primary buffer is also host visible it is
+// mapped directly into memory.data. Otherwise a second, host-visible (coherent
+// and cached) staging buffer of the same size is created and mapped instead.
+// Mapping failures are reported on stderr and are not propagated.
+// NOTE(review): a directly mapped primary memory is not required to be host
+// coherent here - confirm whether explicit flushes are needed.
+ggml_vk_memory ggml_vk_allocate(size_t size) {
+    ggml_vk_memory memory;
+    bool isHostVisible = false;
+    {
+        memory.primaryBuffer = ggml_vk_allocate_buffer(size);
+        vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.primaryBuffer);
+        vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
+        memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
+        mgr.device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
+        if (isHostVisible) {
+            vk::Result r = mgr.device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
+            if (r != vk::Result::eSuccess) {
+                // Keep the result name separate from the message and terminate the line.
+                std::cerr << "Error mapping memory: " << vk::to_string(r) << '\n';
+            }
+        }
+    }
+
+    if (!isHostVisible) {
+        // The primary memory cannot be mapped: go through a staging buffer.
+        memory.stagingBuffer = ggml_vk_allocate_buffer(size);
+        vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.stagingBuffer);
+        vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
+                                                      vk::MemoryPropertyFlagBits::eHostCoherent |
+                                                      vk::MemoryPropertyFlagBits::eHostCached;
+        memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
+        mgr.device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
+        vk::Result r = mgr.device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
+        if (r != vk::Result::eSuccess) {
+            // Keep the result name separate from the message and terminate the line.
+            std::cerr << "Error mapping memory: " << vk::to_string(r) << '\n';
+        }
+    }
+
+    memory.size = size;
+    return memory;
+}
+
+// Destroys the Vulkan buffers referenced by `memory` and frees their device
+// memory. The staging buffer and staging memory are only released when their
+// pointers are set.
+// NOTE(review): the heap objects behind these pointers (the allocation helpers
+// create them with `new`) are not deleted here - confirm who owns them.
+void ggml_vk_free_memory(ggml_vk_memory &memory)
+{
+    const vk::Optional<const vk::AllocationCallbacks> noCallbacks(nullptr);
+
+    // Buffers first ...
+    mgr.device()->destroy(*memory.primaryBuffer, noCallbacks);
+    if (memory.stagingBuffer) {
+        mgr.device()->destroy(*memory.stagingBuffer, noCallbacks);
+    }
+
+    // ... then the memory that backed them.
+    mgr.device()->freeMemory(*memory.primaryMemory, noCallbacks);
+    if (memory.stagingMemory) {
+        mgr.device()->freeMemory(*memory.stagingMemory, noCallbacks);
+    }
+}
+
+// Looks for the registered buffer whose [data, data + size] range contains the
+// whole of tensor `t`.
+//
+// On success the iterator to that buffer is returned and `offset` receives the
+// distance of t->data from the start of the buffer. On failure a message is
+// printed to stderr, `offset` is left untouched and ctx->buffers.end() is
+// returned.
+static
+decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) {
+    for (auto it = ctx->buffers.begin(); it != ctx->buffers.end(); ++it) {
+        const bool startsInside = it->data <= t->data;
+        if (!startsInside) {
+            continue;
+        }
+        const bool endsInside =
+            reinterpret_cast<intptr_t>(it->data) + it->size >= (reinterpret_cast<intptr_t>(t->data) + ggml_nbytes(t));
+        if (!endsInside) {
+            continue;
+        }
+
+        offset = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(it->data);
+        return it;
+    }
+
+    fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data);
+    return ctx->buffers.end();
+}
+
+// Builds a kp::Tensor that refers to the registered buffer holding `t`.
+//
+// The offset of the tensor inside its buffer is rounded down with
+// ggml_vk_aligned_offset(); the number of bytes skipped by that rounding is
+// added to the tensor's byte size so the range still reaches the end of the
+// tensor data, and is written to *alignedOffset when that pointer is non-null.
+//
+// Returns a null pointer when no registered buffer contains the tensor.
+static
+const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) {
+    uint64_t originalOffset = 0;
+    auto res = ggml_vk_find_tensor(ctx, t, originalOffset);
+    if (res == ctx->buffers.end()) {
+        return nullptr;
+    }
+
+    // Create a tensor whose memory will be composed of our buffers at the correct offset
+    const size_t nelements = ggml_nelements(t);
+
+    const size_t vulkanOffset = ggml_vk_aligned_offset(originalOffset);
+    const size_t padding = originalOffset - vulkanOffset;
+
+    // The range starts `padding` bytes before the tensor, so it has to be that
+    // much longer whether or not the caller asked for the padding value;
+    // otherwise the last `padding` bytes of the tensor fall outside of it.
+    const size_t nbytes = ggml_nbytes(t) + padding;
+    if (alignedOffset) {
+        *alignedOffset = static_cast<uint32_t>(padding);
+    }
+
+    return mgr.tensor(
+        t->data,
+        nelements,
+        nbytes, kp::Tensor::TensorDataTypes::eFloat,
+        res->primaryMemory, res->primaryBuffer,
+        res->stagingMemory, res->stagingBuffer,
+        vulkanOffset);
+}
+
+// Appends a copy of `memory` to the context's buffer list. The name argument
+// is currently unused.
+void ggml_vk_add_buffer(
+        struct ggml_kompute_context * ctx,
+        const char * /*name*/,
+        const ggml_vk_memory &memory) {
+    auto &registered = ctx->buffers;
+    registered.emplace_back(memory);
+}
+
+// Evaluates a kp::OpTensorSyncDevice for tensor `t`.
+// Asserts that the tensor lives in one of the context's registered buffers.
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+    const std::shared_ptr<kp::Tensor> tensor = ggml_vk_get_tensor(ctx, t, nullptr);
+    GGML_ASSERT(tensor);
+
+    mgr.sequence()->eval<kp::OpTensorSyncDevice>({tensor});
+}
+
+// Runs ggml_vk_h2d_buffer() on every registered buffer, then sets the
+// context's hasH2DAll flag.
+void ggml_vk_h2d_all(struct ggml_kompute_context * ctx) {
+    for (auto& buffer : ctx->buffers) {
+        ggml_vk_h2d_buffer(buffer);
+    }
+
+    ctx->hasH2DAll = true;
+}
+
+// Runs ggml_vk_d2h_buffer() on every registered buffer.
+void ggml_vk_d2h_all(struct ggml_kompute_context * ctx) {
+    for (auto& buffer : ctx->buffers) {
+        ggml_vk_d2h_buffer(buffer);
+    }
+}
+
+// Evaluates a kp::OpTensorSyncLocal for tensor `t`.
+// Asserts that the tensor lives in one of the context's registered buffers.
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+    const std::shared_ptr<kp::Tensor> tensor = ggml_vk_get_tensor(ctx, t, nullptr);
+    GGML_ASSERT(tensor);
+
+    mgr.sequence()->eval<kp::OpTensorSyncLocal>({tensor});
+}
+
+// Copies an embedded SPIR-V blob into a vector of 32-bit words.
+//
+// rawData - pointer to `size` bytes; it does not need any particular alignment.
+// size    - byte count, must be a multiple of sizeof(uint32_t).
+//
+// The bytes are copied unchanged, so each word keeps the host byte order of
+// the blob. Throws std::runtime_error when `size` is not a whole number of
+// words.
+std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
+    if (size % sizeof(uint32_t) != 0) {
+        throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
+    }
+
+    std::vector<uint32_t> words(size / sizeof(uint32_t));
+
+    // Copy byte by byte instead of reading the byte array through a uint32_t
+    // pointer: the source array is not guaranteed to be 4-byte aligned.
+    unsigned char* dst = reinterpret_cast<unsigned char*>(words.data());
+    for (size_t i = 0; i < size; ++i) {
+        dst[i] = rawData[i];
+    }
+    return words;
+}
+
+// Divides a by b and asserts that the division is exact.
+// A divisor of 0 or 1 returns a unchanged.
+inline static
+uint32_t safe_divide(uint32_t a, uint32_t b) {
+    const bool trivialDivisor = (b <= 1);
+    if (!trivialDivisor) {
+        const uint32_t remainder = a % b;
+        if (remainder != 0) {
+            fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, remainder);
+            GGML_ASSERT(!"safe_divide result would've had remainder");
+        }
+        return a / b;
+    }
+    return a;
+}
+
+// Records one dispatch of the op_add compute shader into `seq`.
+//
+// inA, inB and out are bound in that order. Each offset is divided by 4
+// (safe_divide asserts on a remainder) and passed as a push constant; `size`
+// is passed as the workgroup. A single kp::Algorithm is created on the first
+// call and re-targeted on every later call.
+void ggml_vk_add(kp::Sequence& seq,
+                    const std::shared_ptr<kp::Tensor>& inA,
+                    const std::shared_ptr<kp::Tensor>& inB,
+                    const std::shared_ptr<kp::Tensor>& out,
+                    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                    uint32_t size) {
+
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_add_comp_spv,
+        kp::shader_data::op_add_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+    };
+    const PushConstants pc {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4)
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {inA, inB, out}, shader, {size}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the op_addrow compute shader into `seq`.
+//
+// inA, inB and out are bound in that order. Each offset is divided by 4
+// (safe_divide asserts on a remainder) and pushed together with `row`; `size`
+// is passed as the workgroup. A single kp::Algorithm is created on the first
+// call and re-targeted on every later call.
+void ggml_vk_addrow(kp::Sequence& seq,
+                 const std::shared_ptr<kp::Tensor>& inA,
+                 const std::shared_ptr<kp::Tensor>& inB,
+                 const std::shared_ptr<kp::Tensor>& out,
+                 uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                 uint32_t size, uint32_t row = 0) {
+
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_addrow_comp_spv,
+        kp::shader_data::op_addrow_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        uint32_t row;
+    };
+    const PushConstants pc {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        row
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {inA, inB, out}, shader, {size}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the op_mul compute shader into `seq`.
+//
+// inA, inB and out are bound in that order. Each offset is divided by 4
+// (safe_divide asserts on a remainder) and passed as a push constant; `size`
+// is passed as the workgroup. A single kp::Algorithm is created on the first
+// call and re-targeted on every later call.
+void ggml_vk_mul(kp::Sequence& seq,
+                    const std::shared_ptr<kp::Tensor>& inA,
+                    const std::shared_ptr<kp::Tensor>& inB,
+                    const std::shared_ptr<kp::Tensor>& out,
+                    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                    uint32_t size) {
+
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_mul_comp_spv,
+        kp::shader_data::op_mul_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+    };
+    const PushConstants pc {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4)
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {inA, inB, out}, shader, {size}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the op_mulrow compute shader into `seq`.
+//
+// inA, inB and out are bound in that order. Each offset is divided by 4
+// (safe_divide asserts on a remainder) and pushed together with `row`; `size`
+// is passed as the workgroup. A single kp::Algorithm is created on the first
+// call and re-targeted on every later call.
+void ggml_vk_mulrow(kp::Sequence& seq,
+                 const std::shared_ptr<kp::Tensor>& inA,
+                 const std::shared_ptr<kp::Tensor>& inB,
+                 const std::shared_ptr<kp::Tensor>& out,
+                 uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                 uint32_t size, uint32_t row = 0) {
+
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_mulrow_comp_spv,
+        kp::shader_data::op_mulrow_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        uint32_t row;
+    };
+    const PushConstants pc {
+        safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        row
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {inA, inB, out}, shader, {size}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the op_scale compute shader into `seq`.
+//
+// `in` and `out` are bound in that order. Both offsets are divided by 4
+// (safe_divide asserts on a remainder) and pushed together with `scale`;
+// `size` is passed as the workgroup. A single kp::Algorithm is created on the
+// first call and re-targeted on every later call.
+void ggml_vk_scale(kp::Sequence& seq,
+                   const std::shared_ptr<kp::Tensor>& in,
+                   const std::shared_ptr<kp::Tensor>& out,
+                   uint32_t inOff, uint32_t outOff,
+                   uint32_t size, float scale) {
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_scale_comp_spv,
+        kp::shader_data::op_scale_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        float scale;
+    };
+    const PushConstants pc {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        scale
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({size});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {in, out}, shader, {size}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the unary shader `spirv` into `seq`; shared by the
+// silu, relu and gelu wrappers.
+//
+// `in` and `out` are bound in that order. Both offsets are divided by 4
+// (safe_divide asserts on a remainder) and passed as push constants; `size`
+// is passed as the workgroup.
+//
+// Algorithms are cached per shader, keyed by the address of `spirv`, so each
+// wrapper gets an algorithm built from its own shader. `spirv` must therefore
+// outlive the call; the wrappers pass function-local statics.
+void ggml_vk_xxlu(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
+                  const std::shared_ptr<kp::Tensor>& in,
+                  const std::shared_ptr<kp::Tensor>& out,
+                  uint32_t inOff, uint32_t outOff,
+                  uint32_t size) {
+    struct PushConstants {
+        uint32_t inOff, outOff;
+    } const pushConsts {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+    };
+
+    // A single static algorithm would be built from whichever shader happened
+    // to be used first and then be reused for the other activations.
+    struct CachedAlgo {
+        const std::vector<uint32_t> *spirv;
+        std::shared_ptr<kp::Algorithm> algo;
+    };
+    static std::vector<CachedAlgo> s_cache;
+
+    std::shared_ptr<kp::Algorithm> algo = nullptr;
+    for (const auto& entry : s_cache) {
+        if (entry.spirv == &spirv) {
+            algo = entry.algo;
+            break;
+        }
+    }
+
+    if (!algo) {
+        algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+        if (algo) {
+            s_cache.push_back({&spirv, algo});
+        }
+    } else {
+        algo->setTensors({in, out});
+        algo->setWorkgroup({size});
+        algo->setPushConstants<PushConstants>({pushConsts});
+        algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(algo);
+}
+
+// Forwards its arguments to ggml_vk_xxlu() together with the op_silu shader.
+// The shader words are built once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_silu(Args&&... args) {
+    static const std::vector<uint32_t> siluShader = getSpirvShader(
+        kp::shader_data::op_silu_comp_spv,
+        kp::shader_data::op_silu_comp_spv_len);
+
+    ggml_vk_xxlu(siluShader, std::forward<Args>(args)...);
+}
+
+// Forwards its arguments to ggml_vk_xxlu() together with the op_relu shader.
+// The shader words are built once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_relu(Args&&... args) {
+    static const std::vector<uint32_t> reluShader = getSpirvShader(
+        kp::shader_data::op_relu_comp_spv,
+        kp::shader_data::op_relu_comp_spv_len);
+
+    ggml_vk_xxlu(reluShader, std::forward<Args>(args)...);
+}
+
+// Forwards its arguments to ggml_vk_xxlu() together with the op_gelu shader.
+// The shader words are built once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_gelu(Args&&... args) {
+    static const std::vector<uint32_t> geluShader = getSpirvShader(
+        kp::shader_data::op_gelu_comp_spv,
+        kp::shader_data::op_gelu_comp_spv_len);
+
+    ggml_vk_xxlu(geluShader, std::forward<Args>(args)...);
+}
+
+// Records one dispatch of the op_softmax compute shader into `seq`.
+//
+// `in` and `out` are bound in that order. Both offsets are divided by 4
+// (safe_divide asserts on a remainder) and pushed together with ne00, ne01
+// and ne02. The workgroup is {ne01, ne02, ne03}. A single kp::Algorithm is
+// created on the first call and re-targeted on every later call.
+void ggml_vk_soft_max(kp::Sequence& seq,
+                      const std::shared_ptr<kp::Tensor>& in,
+                      const std::shared_ptr<kp::Tensor>& out,
+                      uint32_t inOff, uint32_t outOff,
+                      int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03) {
+
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_softmax_comp_spv,
+        kp::shader_data::op_softmax_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        int32_t ne00, ne01, ne02;
+    };
+    PushConstants pc {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        ne00, ne01, ne02
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {in, out}, shader, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the normalisation shader `spirv` into `seq`; shared
+// by the norm and rms_norm wrappers.
+//
+// `in` and `out` are bound in that order. Both offsets are divided by 4
+// (safe_divide asserts on a remainder) and pushed together with ne00, nb01
+// and a fixed epsilon of 1e-6. `nrows` is passed as the workgroup. Asserts
+// that nb01 and ne00 are multiples of sizeof(float).
+//
+// Algorithms are cached per shader, keyed by the address of `spirv`, so each
+// wrapper gets an algorithm built from its own shader. `spirv` must therefore
+// outlive the call; the wrappers pass function-local statics.
+void ggml_vk_norm_(const std::vector<uint32_t>& spirv, kp::Sequence& seq,
+                   const std::shared_ptr<kp::Tensor>& in,
+                   const std::shared_ptr<kp::Tensor>& out,
+                   uint32_t inOff, uint32_t outOff,
+                   int32_t ne00, int32_t nb01,
+                   int32_t nrows) {
+    GGML_ASSERT(nb01%sizeof(float) == 0);
+    GGML_ASSERT(ne00%sizeof(float) == 0);
+
+    const float epsilon = 1e-6f; // this is what ggml.c uses for rms norm
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        uint32_t ne00, nb01;
+        float eps;
+    } pushConsts {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        (uint32_t)ne00, (uint32_t)nb01, epsilon
+    };
+
+    // A single static algorithm would be built from whichever shader happened
+    // to be used first and then be reused for the other normalisation.
+    struct CachedAlgo {
+        const std::vector<uint32_t> *spirv;
+        std::shared_ptr<kp::Algorithm> algo;
+    };
+    static std::vector<CachedAlgo> s_cache;
+
+    std::shared_ptr<kp::Algorithm> algo = nullptr;
+    for (const auto& entry : s_cache) {
+        if (entry.spirv == &spirv) {
+            algo = entry.algo;
+            break;
+        }
+    }
+
+    if (!algo) {
+        algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
+        if (algo) {
+            s_cache.push_back({&spirv, algo});
+        }
+    } else {
+        algo->setTensors({in, out});
+        algo->setWorkgroup({(uint32_t)nrows});
+        algo->setPushConstants<PushConstants>({pushConsts});
+        algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(algo);
+}
+
+// Forwards its arguments to ggml_vk_norm_() together with the op_norm shader.
+// The shader words are built once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_norm(Args&&... args) {
+    static const std::vector<uint32_t> normShader = getSpirvShader(
+        kp::shader_data::op_norm_comp_spv,
+        kp::shader_data::op_norm_comp_spv_len);
+
+    ggml_vk_norm_(normShader, std::forward<Args>(args)...);
+}
+
+// Forwards its arguments to ggml_vk_norm_() together with the op_rmsnorm
+// shader. The shader words are built once per instantiation and kept in a
+// static.
+template <typename... Args>
+void ggml_vk_rms_norm(Args&&... args) {
+    static const std::vector<uint32_t> rmsNormShader = getSpirvShader(
+        kp::shader_data::op_rmsnorm_comp_spv,
+        kp::shader_data::op_rmsnorm_comp_spv_len);
+
+    ggml_vk_norm_(rmsNormShader, std::forward<Args>(args)...);
+}
+
+// Records one dispatch of the op_diagmask compute shader into `seq`.
+//
+// `in` and `out` are bound in that order. Both offsets are divided by 4
+// (safe_divide asserts on a remainder) and pushed together with n_past, ne00
+// and ne01. The workgroup is {ne00, ne01, ne02}. A single kp::Algorithm is
+// created on the first call and re-targeted on every later call.
+void ggml_vk_diag_mask_inf(kp::Sequence& seq,
+                           const std::shared_ptr<kp::Tensor>& in,
+                           const std::shared_ptr<kp::Tensor>& out,
+                           uint32_t inOff, uint32_t outOff,
+                           uint32_t n_past,
+                           int32_t ne00, int32_t ne01, int32_t ne02) {
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_diagmask_comp_spv,
+        kp::shader_data::op_diagmask_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        uint32_t n_past;
+        int32_t ne00, ne01;
+    };
+    PushConstants pc {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        n_past,
+        ne00, ne01
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {in, out}, shader, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the op_mul_mat_f16 compute shader into `seq`.
+//
+// inA, inB and out are bound in that order. inAOff is divided by 2, inBOff
+// and outOff by 4 (safe_divide asserts on a remainder); they are pushed with
+// ne00, nb01, nb02, nb11, nb12, ne0 and ne1. The workgroup is
+// {ne01, ne11, ne12}. A single kp::Algorithm is created on the first call and
+// re-targeted on every later call.
+void ggml_vk_mul_mat_f16(kp::Sequence& seq,
+                         const std::shared_ptr<kp::Tensor>& inA,
+                         const std::shared_ptr<kp::Tensor>& inB,
+                         const std::shared_ptr<kp::Tensor>& out,
+                         uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                         int32_t ne00, int32_t ne01,
+                         uint32_t nb01, uint32_t nb02,
+                         int32_t ne11, int32_t ne12,
+                         uint32_t nb11, uint32_t nb12,
+                         int32_t ne0, int32_t ne1) {
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_mul_mat_f16_comp_spv,
+        kp::shader_data::op_mul_mat_f16_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00;
+        uint32_t nb01, nb02;
+        uint32_t nb11, nb12;
+        int32_t ne0, ne1;
+    };
+    PushConstants pc {
+        safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, nb01, nb02, nb11, nb12, ne0, ne1,
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {inA, inB, out}, shader, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the quantised mat-mul shader `spirv` into `seq`;
+// shared by the q4_0 and q4_1 wrappers.
+//
+// inA, inB and out are bound in that order. inAOff is divided by block_size,
+// inBOff and outOff by 4 (safe_divide asserts on a remainder); they are
+// pushed with ne00, ne10 and ne0. The workgroup is {ne01, ne11}.
+//
+// Algorithms are cached per shader, keyed by the address of `spirv`, so each
+// wrapper gets an algorithm built from its own shader. `spirv` must therefore
+// outlive the call; the wrappers pass function-local statics.
+void ggml_vk_mul_mat_q4_x(const std::vector<uint32_t>& spirv, uint32_t block_size, kp::Sequence& seq,
+                          const std::shared_ptr<kp::Tensor>& inA,
+                          const std::shared_ptr<kp::Tensor>& inB,
+                          const std::shared_ptr<kp::Tensor>& out,
+                          uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                          int32_t ne00, int32_t ne10, int32_t ne0,
+                          int32_t ne01, int32_t ne11) {
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne10, ne0;
+    } pushConsts {
+        safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0,
+    };
+
+    // A single static algorithm would be built from whichever shader happened
+    // to be used first and then be reused for the other quantisation format.
+    struct CachedAlgo {
+        const std::vector<uint32_t> *spirv;
+        std::shared_ptr<kp::Algorithm> algo;
+    };
+    static std::vector<CachedAlgo> s_cache;
+
+    std::shared_ptr<kp::Algorithm> algo = nullptr;
+    for (const auto& entry : s_cache) {
+        if (entry.spirv == &spirv) {
+            algo = entry.algo;
+            break;
+        }
+    }
+
+    if (!algo) {
+        algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts});
+        if (algo) {
+            s_cache.push_back({&spirv, algo});
+        }
+    } else {
+        algo->setTensors({inA, inB, out});
+        algo->setWorkgroup({unsigned(ne01), unsigned(ne11)});
+        algo->setPushConstants<PushConstants>({pushConsts});
+        algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(algo);
+}
+
+// Forwards its arguments to ggml_vk_mul_mat_q4_x() together with the
+// op_mul_mat_q4_0 shader and a block size of 1. The shader words are built
+// once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_mul_mat_q4_0(Args&&... args) {
+    static const std::vector<uint32_t> q4_0Shader = getSpirvShader(
+        kp::shader_data::op_mul_mat_q4_0_comp_spv,
+        kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
+
+    ggml_vk_mul_mat_q4_x(q4_0Shader, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+}
+
+// FIXME: This could be improved like was done in q4_0 version but needs testing...
+// Forwards its arguments to ggml_vk_mul_mat_q4_x() together with the
+// op_mul_mat_q4_1 shader and a block size of 1. The shader words are built
+// once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_mul_mat_q4_1(Args&&... args) {
+    static const std::vector<uint32_t> q4_1Shader = getSpirvShader(
+        kp::shader_data::op_mul_mat_q4_1_comp_spv,
+        kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
+
+    ggml_vk_mul_mat_q4_x(q4_1Shader, 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+}
+
+// Records one dispatch of the get-rows shader `spirv` into `seq`; shared by
+// the f16, q4_0 and q4_1 wrappers.
+//
+// inA, inB and out are bound in that order. inAOff is divided by
+// element_size, inBOff and outOff by 4 (safe_divide asserts on a remainder);
+// they are pushed with ne00, nb01 and nb1. `size` is passed as the workgroup.
+// Asserts that nb01 is a multiple of element_size, that nb1 is a multiple of
+// sizeof(float) and, when qk is non-zero, that ne00 is a multiple of qk.
+//
+// Algorithms are cached per shader, keyed by the address of `spirv`, so each
+// wrapper gets an algorithm built from its own shader. `spirv` must therefore
+// outlive the call; the wrappers pass function-local statics.
+void ggml_vk_get_rows(const std::vector<uint32_t>& spirv,
+                      unsigned element_size, unsigned qk,
+                      kp::Sequence& seq,
+                      const std::shared_ptr<kp::Tensor>& inA,
+                      const std::shared_ptr<kp::Tensor>& inB,
+                      const std::shared_ptr<kp::Tensor>& out,
+                      uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+                      int32_t ne00, int32_t nb01, int32_t nb1,
+                      uint32_t size) {
+    GGML_ASSERT(nb01%element_size == 0);
+    GGML_ASSERT(nb1%sizeof(float) == 0);
+    if (qk) GGML_ASSERT(ne00%qk == 0);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, nb01, nb1;
+    } pushConsts {
+        safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, nb01, nb1
+    };
+
+    // A single static algorithm would be built from whichever shader happened
+    // to be used first and then be reused for the other source formats.
+    struct CachedAlgo {
+        const std::vector<uint32_t> *spirv;
+        std::shared_ptr<kp::Algorithm> algo;
+    };
+    static std::vector<CachedAlgo> s_cache;
+
+    std::shared_ptr<kp::Algorithm> algo = nullptr;
+    for (const auto& entry : s_cache) {
+        if (entry.spirv == &spirv) {
+            algo = entry.algo;
+            break;
+        }
+    }
+
+    if (!algo) {
+        algo = mgr.algorithm<float, PushConstants>(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+        if (algo) {
+            s_cache.push_back({&spirv, algo});
+        }
+    } else {
+        algo->setTensors({inA, inB, out});
+        algo->setWorkgroup({size});
+        algo->setPushConstants<PushConstants>({pushConsts});
+        algo->updateDescriptors(ggml_kompute_context::instance->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(algo);
+}
+
+// Forwards its arguments to ggml_vk_get_rows() together with the
+// op_getrows_f16 shader, an element size of sizeof(half) and qk = 0. The
+// shader words are built once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_get_rows_f16(Args&&... args) {
+    static const std::vector<uint32_t> f16Shader = getSpirvShader(
+        kp::shader_data::op_getrows_f16_comp_spv,
+        kp::shader_data::op_getrows_f16_comp_spv_len);
+
+    ggml_vk_get_rows(f16Shader, sizeof(half), 0, std::forward<Args>(args)...);
+}
+
+// Forwards its arguments to ggml_vk_get_rows() together with the
+// op_getrows_q4_0 shader, an element size of 1 and qk = QK4_0. The shader
+// words are built once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_get_rows_q4_0(Args&&... args) {
+    static const std::vector<uint32_t> q4_0Shader = getSpirvShader(
+        kp::shader_data::op_getrows_q4_0_comp_spv,
+        kp::shader_data::op_getrows_q4_0_comp_spv_len);
+
+    ggml_vk_get_rows(q4_0Shader, 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
+}
+
+// Forwards its arguments to ggml_vk_get_rows() together with the
+// op_getrows_q4_1 shader, an element size of 1 and qk = QK4_1. The shader
+// words are built once per instantiation and kept in a static.
+template <typename... Args>
+void ggml_vk_get_rows_q4_1(Args&&... args) {
+    static const std::vector<uint32_t> q4_1Shader = getSpirvShader(
+        kp::shader_data::op_getrows_q4_1_comp_spv,
+        kp::shader_data::op_getrows_q4_1_comp_spv_len);
+
+    ggml_vk_get_rows(q4_1Shader, 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
+}
+
+// Records one dispatch of the op_rope compute shader into `seq`.
+//
+// `in` and `out` are bound in that order. Both offsets are divided by 4
+// (safe_divide asserts on a remainder) and pushed together with n_past,
+// n_dims, mode, freq_base, freq_scale, the source strides nb00..nb03, ne0 and
+// the destination strides nb0..nb3. All eight strides are asserted to be
+// multiples of sizeof(float). The workgroup is {ne01, ne02, ne03}. A single
+// kp::Algorithm is created on the first call and re-targeted on every later
+// call.
+void ggml_vk_rope(kp::Sequence& seq,
+                  const std::shared_ptr<kp::Tensor>& in,
+                  const std::shared_ptr<kp::Tensor>& out,
+                  uint32_t inOff, uint32_t outOff,
+                  uint32_t n_past, int32_t n_dims, int32_t mode,
+                  float freq_base, float freq_scale,
+                  int32_t ne01, int32_t ne02, int32_t ne03,
+                  uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
+                  int32_t ne0,
+                  uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) {
+    static const std::vector<uint32_t> shader = getSpirvShader(
+        kp::shader_data::op_rope_comp_spv,
+        kp::shader_data::op_rope_comp_spv_len);
+
+    // Source strides.
+    GGML_ASSERT(nb03%sizeof(float) == 0);
+    GGML_ASSERT(nb02%sizeof(float) == 0);
+    GGML_ASSERT(nb01%sizeof(float) == 0);
+    GGML_ASSERT(nb00%sizeof(float) == 0);
+    // Destination strides.
+    GGML_ASSERT(nb3%sizeof(float) == 0);
+    GGML_ASSERT(nb2%sizeof(float) == 0);
+    GGML_ASSERT(nb1%sizeof(float) == 0);
+    GGML_ASSERT(nb0%sizeof(float) == 0);
+
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        uint32_t n_past;
+        int32_t n_dims, mode;
+        float freq_base, freq_scale;
+        uint32_t nb00, nb01, nb02, nb03;
+        int32_t ne0;
+        uint32_t nb0, nb1, nb2, nb3;
+    };
+    PushConstants pc {
+        safe_divide(inOff, 4), safe_divide(outOff, 4),
+        n_past, n_dims, mode,
+        freq_base, freq_scale,
+        nb00, nb01, nb02, nb03,
+        ne0,
+        nb0, nb1, nb2, nb3
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {in, out}, shader, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Records one dispatch of the copy shader `spirv` into `seq`.
+//
+// `in` and `out` are bound in that order. inOff is divided by
+// in_element_size and outOff by out_element_size (safe_divide asserts on a
+// remainder); they are pushed with ne00..ne02, nb00..nb03, ne0..ne2 and
+// nb0..nb3. The workgroup is {ne01, ne02, ne03}.
+//
+// The cached kp::Algorithm is a function-local static, so there is one per
+// <in_element_size, out_element_size> instantiation.
+// NOTE(review): two different shaders used with the same element-size pair
+// would share that algorithm - confirm every pair maps to a single shader.
+template<uint32_t in_element_size, uint32_t out_element_size>
+void ggml_vk_cpy(const std::vector<uint32_t>& spirv,
+                 kp::Sequence& seq,
+                 const std::shared_ptr<kp::Tensor>& in,
+                 const std::shared_ptr<kp::Tensor>& out,
+                 uint32_t inOff, uint32_t outOff,
+                 int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
+                 uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
+                 int32_t ne0, int32_t ne1, int32_t ne2,
+                 uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) {
+    struct PushConstants {
+        uint32_t inOff, outOff;
+        int32_t ne00, ne01, ne02;
+        uint32_t nb00, nb01, nb02, nb03;
+        int32_t ne0, ne1, ne2;
+        uint32_t nb0, nb1, nb2, nb3;
+    };
+    PushConstants pc {
+        safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size),
+        ne00, ne01, ne02,
+        nb00, nb01, nb02, nb03,
+        ne0, ne1, ne2,
+        nb0, nb1, nb2, nb3
+    };
+
+    auto * const pool = ggml_kompute_context::instance->pool.get();
+
+    static std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (s_algo) {
+        // Point the cached algorithm at this call's tensors and parameters.
+        s_algo->setTensors({in, out});
+        s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
+        s_algo->setPushConstants<PushConstants>({pc});
+        s_algo->updateDescriptors(pool);
+    } else {
+        s_algo = mgr.algorithm<float, PushConstants>(pool, {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pc});
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
+// Forwards its arguments to ggml_vk_cpy<4, 2>() together with the
+// op_cpy_f32_f16 shader. The shader words are built once per instantiation
+// and kept in a static.
+template <typename... Args>
+void ggml_vk_cpy_f32_f16(Args&&... args) {
+    static const std::vector<uint32_t> copyShader = getSpirvShader(
+        kp::shader_data::op_cpy_f32_f16_comp_spv,
+        kp::shader_data::op_cpy_f32_f16_comp_spv_len);
+
+    ggml_vk_cpy<4, 2>(copyShader, std::forward<Args>(args)...);
+}
+
+// Forwards its arguments to ggml_vk_cpy<4, 4>() together with the
+// op_cpy_f32_f32 shader. The shader words are built once per instantiation
+// and kept in a static.
+template <typename... Args>
+void ggml_vk_cpy_f32_f32(Args&&... args) {
+    static const std::vector<uint32_t> copyShader = getSpirvShader(
+        kp::shader_data::op_cpy_f32_f32_comp_spv,
+        kp::shader_data::op_cpy_f32_f32_comp_spv_len);
+
+    ggml_vk_cpy<4, 4>(copyShader, std::forward<Args>(args)...);
+}
+
+// Forwards its arguments to ggml_vk_cpy<2, 2>() together with the
+// op_cpy_f16_f16 shader. The shader words are built once per instantiation
+// and kept in a static.
+template <typename... Args>
+void ggml_vk_cpy_f16_f16(Args&&... args) {
+    static const std::vector<uint32_t> copyShader = getSpirvShader(
+        kp::shader_data::op_cpy_f16_f16_comp_spv,
+        kp::shader_data::op_cpy_f16_f16_comp_spv_len);
+
+    ggml_vk_cpy<2, 2>(copyShader, std::forward<Args>(args)...);
+}
+
+// Forwards its arguments to ggml_vk_cpy<2, 4>() together with the
+// op_cpy_f16_f32 shader. The shader words are built once per instantiation
+// and kept in a static.
+template <typename... Args>
+void ggml_vk_cpy_f16_f32(Args&&... args) {
+    static const std::vector<uint32_t> copyShader = getSpirvShader(
+        kp::shader_data::op_cpy_f16_f32_comp_spv,
+        kp::shader_data::op_cpy_f16_f32_comp_spv_len);
+
+    ggml_vk_cpy<2, 4>(copyShader, std::forward<Args>(args)...);
+}
+
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
+    const int n_seq = 8; // number of Kompute sequences the graph nodes are split across
+    // Records every node of gf into one of n_seq sequences (contiguous node ranges), starts each sequence asynchronously, then waits for all of them.
+    // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting
+    // it to the size of the graph, but I think it can be made smaller?
+    ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes);
+
+    std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);
+
+    for (auto& sequence : sequences) {
+        sequence = mgr.sequence();
+    }
+    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
+        const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; // rounded up
+
+        auto& seq = *sequences[seq_idx];
+        // Half-open node range [node_start, node_end) of this sequence. n_nodes_per_seq is rounded up, so the end is clamped to gf->n_nodes to stay in bounds.
+        const int node_start = (seq_idx + 0) * n_nodes_per_seq;
+        const int node_end = (seq_idx == n_seq - 1 || (seq_idx + 1) * n_nodes_per_seq > gf->n_nodes) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq;
+
+        for (int i = node_start; i < node_end; ++i) {
+            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
+            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+            struct ggml_tensor * dst = gf->nodes[i];
+            GGML_ASSERT(dst->data != nullptr);
+            // Snapshot the ne[]/nb[] fields of src0, src1 and dst (0 when the tensor is absent).
+            const int32_t ne00 = src0 ? src0->ne[0] : 0;
+            const int32_t ne01 = src0 ? src0->ne[1] : 0;
+            const int32_t ne02 = src0 ? src0->ne[2] : 0;
+            const int32_t ne03 = src0 ? src0->ne[3] : 0;
+
+            const uint32_t nb00 = src0 ? src0->nb[0] : 0;
+            const uint32_t nb01 = src0 ? src0->nb[1] : 0;
+            const uint32_t nb02 = src0 ? src0->nb[2] : 0;
+            const uint32_t nb03 = src0 ? src0->nb[3] : 0;
+
+            const int32_t ne10 = src1 ? src1->ne[0] : 0;
+            const int32_t ne11 = src1 ? src1->ne[1] : 0;
+            const int32_t ne12 = src1 ? src1->ne[2] : 0;
+//            const int32_t ne13 = src1 ? src1->ne[3] : 0;
+
+//            const uint32_t nb10 = src1 ? src1->nb[0] : 0;
+            const uint32_t nb11 = src1 ? src1->nb[1] : 0;
+            const uint32_t nb12 = src1 ? src1->nb[2] : 0;
+//            const uint32_t nb13 = src1 ? src1->nb[3] : 0;
+
+            const int32_t ne0 = dst ? dst->ne[0] : 0;
+            const int32_t ne1 = dst ? dst->ne[1] : 0;
+            const int32_t ne2 = dst ? dst->ne[2] : 0;
+//            const int32_t ne3 = dst ? dst->ne[3] : 0;
+
+            const uint32_t nb0 = dst ? dst->nb[0] : 0;
+            const uint32_t nb1 = dst ? dst->nb[1] : 0;
+            const uint32_t nb2 = dst ? dst->nb[2] : 0;
+            const uint32_t nb3 = dst ? dst->nb[3] : 0;
+
+            const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+//            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+            const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
+            // Look up the Kompute tensor for each ggml tensor; ggml_vk_get_tensor also writes an offset through its last argument.
+            const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
+            uint32_t off_src0 = 0;
+            uint32_t off_src1 = 0;
+            uint32_t off_dst = 0;
+            const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0, &off_src0) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1, &off_src1) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_dst  = dst ? ggml_vk_get_tensor(ctx, dst, &off_dst)  : nullTensor;
+            // Record the kernel matching the node's operation; unsupported combinations jump to not_implemented below.
+            switch (dst->op) {
+                case GGML_OP_RESHAPE:
+                case GGML_OP_VIEW:
+                case GGML_OP_TRANSPOSE:
+                case GGML_OP_PERMUTE:
+                    {
+                        // noop
+                    } break;
+                case GGML_OP_ADD:
+                    {
+                        if (ggml_nelements(src1) == ne10) {
+                            // src1 is a row
+                            ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00);
+                        } else {
+                            ggml_vk_add(seq, id_src0, id_src1, id_dst,  off_src0, off_src1, off_dst, ggml_nelements(dst));
+                        }
+                    } break;
+                case GGML_OP_MUL:
+                    {
+                        if (ggml_nelements(src1) == ne10) {
+                            // src1 is a row
+                            ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00);
+                        } else {
+                            ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst));
+                        }
+                    } break;
+                case GGML_OP_SCALE:
+                    {
+                        const float scale = *(const float *) src1->data; // the factor is read on the host through src1's data pointer
+                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
+                    } break;
+                case GGML_OP_UNARY:
+                    switch (ggml_get_unary_op(gf->nodes[i])) {
+                        case GGML_UNARY_OP_SILU:
+                            {
+                                ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                            } break;
+                        case GGML_UNARY_OP_RELU:
+                            {
+                                ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                            } break;
+                        case GGML_UNARY_OP_GELU:
+                            {
+                                ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst));
+                            } break;
+                        default:
+                            {
+                                fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                GGML_ASSERT(false);
+                            }
+                    } break;
+                case GGML_OP_SOFT_MAX:
+                    {
+                        ggml_vk_soft_max(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03);
+                    } break;
+                case GGML_OP_DIAG_MASK_INF:
+                    {
+                        const int n_past = ((int32_t *)(dst->op_params))[0];
+                        ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
+                    } break;
+                case GGML_OP_NORM:
+                    {
+                        ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0));
+                    } break;
+                case GGML_OP_RMS_NORM:
+                    {
+                        ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0));
+                    } break;
+                case GGML_OP_MUL_MAT:
+                    {
+                        if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32)
+                            && src1->type == GGML_TYPE_F32) {
+                            ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1);
+                        } else if (src0->type == GGML_TYPE_Q4_0
+                                   && src1->type == GGML_TYPE_F32) {
+                            ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
+                        } else if (src0->type == GGML_TYPE_Q4_1
+                                   && src1->type == GGML_TYPE_F32) {
+                            ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11);
+                        } else {
+                            fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type);
+                            goto not_implemented;
+                        }
+                    } break;
+                case GGML_OP_GET_ROWS:
+                    {
+                        if (src0->type == GGML_TYPE_F16) {
+                            ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else if (src0->type == GGML_TYPE_Q4_0) {
+                            ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else if (src0->type == GGML_TYPE_Q4_1) {
+                            ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else {
+                            fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type);
+                            goto not_implemented;
+                        }
+                    } break;
+                case GGML_OP_ROPE:
+                    {
+                        const int n_past = ((int32_t *) dst->op_params)[0];
+                        const int n_dims = ((int32_t *) dst->op_params)[1];
+                        const int mode   = ((int32_t *) dst->op_params)[2];
+                        float freq_base;
+                        float freq_scale;
+                        memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+                        memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+                        ggml_vk_rope(seq, id_src0, id_dst, off_src0, off_dst, n_past, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3);
+                    } break;
+                case GGML_OP_DUP:
+                case GGML_OP_CPY:
+                case GGML_OP_CONT:
+                    {
+                        switch (src0t) {
+                            case GGML_TYPE_F32:
+                                {
+                                    switch (dstt) {
+                                        case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        default: goto not_implemented;
+                                    }
+                                } break;
+                            case GGML_TYPE_F16:
+                                {
+                                    switch (dstt) {
+                                        case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        default: goto not_implemented;
+                                    }
+                                } break;
+                            default: goto not_implemented;
+                        }
+                    } break;
+                default: goto not_implemented;
+            }
+            continue;
+            not_implemented: {} // unsupported node: it is only logged and the loop carries on (the assert below is disabled)
+            fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+            //GGML_ASSERT(false);
+        }
+
+        // Evaluate sequence
+        seq.evalAsync();
+    }
+
+    // Wait for all sequences to finish
+    for (auto& sequence : sequences) {
+        if (sequence->isRunning())
+            sequence->evalAwait();
+    }
+
+    ggml_vk_free_descriptor_pool(ctx);
+}
+
+template<> // explicit specialization of kp::TensorT<T>::dataType() for T = half
+kp::Tensor::TensorDataTypes
+kp::TensorT<half>::dataType()
+{
+    return TensorDataTypes::eFloat; // half tensors are tagged eFloat; NOTE(review): presumably no dedicated half enumerator exists - confirm
+}
+
+template<> // explicit specialization of kp::TensorT<T>::dataType() for T = uint8_t
+kp::Tensor::TensorDataTypes
+kp::TensorT<uint8_t>::dataType()
+{
+    return TensorDataTypes::eUnsignedInt; // byte tensors are tagged eUnsignedInt; NOTE(review): presumably used for raw/quantized data - confirm
+}
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
new file mode 100644
index 000000000..ad8b41e4d
--- /dev/null
+++ b/ggml-vulkan.h
@@ -0,0 +1,61 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+#include <string>
+
+struct ggml_kompute_context;
+
+namespace vk { // forward declarations only: this header holds these types by pointer and includes no Vulkan header
+    class DeviceMemory;
+    class Buffer;
+};
+
+struct ggml_vk_memory { // one allocation: a data pointer, a size, and primary + staging memory/buffer handles (all empty by default)
+    void *data = nullptr; // presumably the host-visible pointer to the allocation - TODO confirm against ggml_vk_allocate
+    size_t size = 0; // presumably the allocation size in bytes - TODO confirm
+    vk::DeviceMemory *primaryMemory = nullptr; // NOTE(review): ownership of these four pointers is not visible here; see ggml_vk_free_memory
+    vk::Buffer *primaryBuffer = nullptr;
+    vk::DeviceMemory *stagingMemory = nullptr; // presumably only set when a separate staging copy is needed - TODO confirm
+    vk::Buffer *stagingBuffer = nullptr;
+};
+
+struct ggml_vk_device { // description of one device, as returned by ggml_vk_available_devices / ggml_vk_current_device
+    int index = 0; // presumably the index accepted by ggml_vk_init_device(int) - TODO confirm
+    int type = 0;           // same as VkPhysicalDeviceType
+    size_t heapSize = 0; // presumably a memory heap size in bytes - TODO confirm which heap
+    std::string name;
+    std::string vendor;
+};
+
+std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
+bool ggml_vk_init_device(size_t memoryRequired, const std::string &device); // overload 1 of 3: memory requirement + string
+bool ggml_vk_init_device(const ggml_vk_device &device); // overload 2 of 3: device description
+bool ggml_vk_init_device(int device); // overload 3 of 3: integer, presumably ggml_vk_device::index - TODO confirm
+bool ggml_vk_has_device();
+ggml_vk_device ggml_vk_current_device();
+struct ggml_kompute_context * ggml_vk_init(void);
+bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx);
+void ggml_vk_free(struct ggml_kompute_context * ctx);
+size_t ggml_vk_aligned_offset(size_t offset);
+ggml_vk_memory ggml_vk_allocate(size_t size);
+void ggml_vk_free_memory(ggml_vk_memory &memory);
+// NOTE(review): only declarations live here; the definitions (not visible from this header) determine the exact semantics.
+void ggml_vk_add_buffer(
+    struct ggml_kompute_context * ctx,
+    const char * name,
+    const ggml_vk_memory &memory);
+// ggml_tensor and ggml_cgraph are named only through elaborated type specifiers; this header does not include ggml.h.
+void ggml_vk_h2d_all(struct ggml_kompute_context * ctx);
+void ggml_vk_d2h_all(struct ggml_kompute_context * ctx);
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
diff --git a/ggml.c b/ggml.c
index a0be068d6..cf9e056ba 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9007,7 +9007,7 @@ static void ggml_compute_forward_add_q_f32(
     }
 }
 
-static void ggml_compute_forward_add(
+void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -9587,7 +9587,7 @@ static void ggml_compute_forward_mul_f32(
     }
 }
 
-static void ggml_compute_forward_mul(
+void ggml_compute_forward_mul(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -10510,7 +10510,7 @@ static void ggml_compute_forward_elu(
 
 // ggml_compute_forward_relu
 
-static void ggml_compute_forward_relu_f32(
+void ggml_compute_forward_relu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10534,7 +10534,7 @@ static void ggml_compute_forward_relu_f32(
     }
 }
 
-static void ggml_compute_forward_relu(
+void ggml_compute_forward_relu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10552,7 +10552,7 @@ static void ggml_compute_forward_relu(
 
 // ggml_compute_forward_gelu
 
-static void ggml_compute_forward_gelu_f32(
+void ggml_compute_forward_gelu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10593,7 +10593,7 @@ static void ggml_compute_forward_gelu_f32(
     }
 }
 
-static void ggml_compute_forward_gelu(
+void ggml_compute_forward_gelu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10670,7 +10670,7 @@ static void ggml_compute_forward_gelu_quick(
 
 // ggml_compute_forward_silu
 
-static void ggml_compute_forward_silu_f32(
+void ggml_compute_forward_silu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10711,7 +10711,7 @@ static void ggml_compute_forward_silu_f32(
     }
 }
 
-static void ggml_compute_forward_silu(
+void ggml_compute_forward_silu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10844,7 +10844,7 @@ static void ggml_compute_forward_norm_f32(
     }
 }
 
-static void ggml_compute_forward_norm(
+void ggml_compute_forward_norm(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -10910,7 +10910,7 @@ static void ggml_compute_forward_rms_norm_f32(
     }
 }
 
-static void ggml_compute_forward_rms_norm(
+void ggml_compute_forward_rms_norm(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -11623,7 +11623,7 @@ static void ggml_compute_forward_scale_f32(
     }
 }
 
-static void ggml_compute_forward_scale(
+void ggml_compute_forward_scale(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -11744,7 +11744,7 @@ static void ggml_compute_forward_set(
 
 // ggml_compute_forward_cpy
 
-static void ggml_compute_forward_cpy(
+void ggml_compute_forward_cpy(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -11888,7 +11888,7 @@ static void ggml_compute_forward_get_rows_f32(
     }
 }
 
-static void ggml_compute_forward_get_rows(
+void ggml_compute_forward_get_rows(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12164,7 +12164,7 @@ static void ggml_compute_forward_diag_mask_f32(
     }
 }
 
-static void ggml_compute_forward_diag_mask_inf(
+void ggml_compute_forward_diag_mask_inf(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -12198,7 +12198,7 @@ static void ggml_compute_forward_diag_mask_zero(
 
 // ggml_compute_forward_soft_max
 
-static void ggml_compute_forward_soft_max_f32(
+void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -12887,7 +12887,7 @@ static void ggml_compute_forward_rope_f16(
     }
 }
 
-static void ggml_compute_forward_rope(
+void ggml_compute_forward_rope(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
diff --git a/kompute/.ccls b/kompute/.ccls
new file mode 100644
index 000000000..71d5d711e
--- /dev/null
+++ b/kompute/.ccls
@@ -0,0 +1,27 @@
+
+%clang
+
+-fdeclspec
+-fms-extensions
+-Wall
+-Wextra
+-std=c++17
+
+%h -x
+%h c++-header
+
+-DDEBUG=1
+-DKOMPUTE_INCLUDE_FOR_SYNTAX
+
+-I/usr/include/python3.6/
+-I./python/pybind11/include/
+
+-I./build/_deps/vulkan_header-src/include/
+-I./build/_deps/spdlog-src/include/
+-I./build/_deps/googletest-src/googletest/include/
+-I./build/_deps/fmt-src/include/
+
+-I./src/include/
+-I./build/src/shaders/glsl/
+-I./build/test/shaders/glsl/
+-I./test/utils/
diff --git a/kompute/.clang-format b/kompute/.clang-format
new file mode 100644
index 000000000..5191313a3
--- /dev/null
+++ b/kompute/.clang-format
@@ -0,0 +1,5 @@
+﻿---
+BasedOnStyle: Mozilla
+IndentWidth: 4
+
+...
diff --git a/kompute/.dockerignore b/kompute/.dockerignore
new file mode 100644
index 000000000..9498d9195
--- /dev/null
+++ b/kompute/.dockerignore
@@ -0,0 +1,4 @@
+build/*
+examples/*
+docker-builders/
+swiftshader/
diff --git a/kompute/.github/workflows/cpp_examples.yml b/kompute/.github/workflows/cpp_examples.yml
new file mode 100644
index 000000000..ad5306e9b
--- /dev/null
+++ b/kompute/.github/workflows/cpp_examples.yml
@@ -0,0 +1,58 @@
+name: C++ Tests
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  array-multiplication-example:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Debug g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/examples/array_multiplication/build
+        source-dir: ${{github.workspace}}/examples/array_multiplication
+        cc: gcc
+        cxx: g++
+        build-type: Debug
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
+        build-options: --parallel # Given we don't build too many resources we can leverage parallel
+    - name: Run tests
+      run: ./examples/array_multiplication/build/src/kompute_array_mult
+
+  logistc-regression-example:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Debug g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/examples/logistic_regression/build
+        source-dir: ${{github.workspace}}/examples/logistic_regression
+        cc: gcc
+        cxx: g++
+        build-type: Debug
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
+        build-options: --parallel # Given we don't build too many resources we can leverage parallel
+    - name: Run tests
+      run: ./examples/logistic_regression/build/src/kompute_logistic_regression
diff --git a/kompute/.github/workflows/cpp_tests.yml b/kompute/.github/workflows/cpp_tests.yml
new file mode 100644
index 000000000..53a90a145
--- /dev/null
+++ b/kompute/.github/workflows/cpp_tests.yml
@@ -0,0 +1,104 @@
+name: C++ Tests
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  cpp-tests-debug-with-debug-layers:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Debug g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/build
+        source-dir: ${{github.workspace}}
+        cc: gcc
+        cxx: g++
+        build-type: Debug
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
+    - name: Run tests
+      run: make mk_run_tests
+
+  cpp-tests-release-with-debug-layers:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Release g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/build
+        source-dir: ${{github.workspace}}
+        cc: gcc
+        cxx: g++
+        build-type: Release
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
+    - name: Run tests
+      run: make mk_run_tests
+
+  cpp-tests-debug-without-debug-layers:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Debug g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/build
+        source-dir: ${{github.workspace}}
+        cc: gcc
+        cxx: g++
+        build-type: Debug
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
+    - name: Run tests
+      run: make mk_run_tests
+  
+  cpp-tests-release-without-debug-layers:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    env:
+      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: "[Release g++] Build & Test"
+      uses: KomputeProject/action-cmake-build@master
+      with:
+        build-dir: ${{github.workspace}}/build
+        source-dir: ${{github.workspace}}
+        cc: gcc
+        cxx: g++
+        build-type: Release
+        run-test: false
+        ctest-options: -V
+        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
+    - name: Run tests
+      run: make mk_run_tests
diff --git a/kompute/.github/workflows/python_tests.yml b/kompute/.github/workflows/python_tests.yml
new file mode 100644
index 000000000..9f84d1e85
--- /dev/null
+++ b/kompute/.github/workflows/python_tests.yml
@@ -0,0 +1,28 @@
+name: Python Tests
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  python-tests:
+    runs-on: ubuntu-latest
+    container: axsauze/kompute-builder:0.4
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: false
+    - name: Install Python Requirements
+      run: pip3 install --user -r python/test/requirements-dev.txt
+    - name: Python Build
+      env:
+        KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2
+        KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON
+      run: pip3 install --user . -v
+    - name: Python run Tests
+      run: |
+        export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json
+        make test_python
diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt
new file mode 100644
index 000000000..f89e13d1d
--- /dev/null
+++ b/kompute/CMakeLists.txt
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+# C must be enabled in addition to CXX: the xxd target declared at the end of this file is
+# built from external/bin/xxd.c, which cannot be compiled when only CXX is enabled.
+project(kompute VERSION 0.8.1 LANGUAGES C CXX)
+
+# Only change the folder behavior if kompute is not a subproject
+if("${CMAKE_PROJECT_NAME}" STREQUAL "${PROJECT_NAME}")
+    set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+    set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake")
+    set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin)
+    set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
+endif()
+
+# Avoid the dll boilerplate code for windows
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")
+
+set(KOMPUTE_LIBRARIES kompute CACHE INTERNAL "")
+
+# ####################################################
+# Options
+# ####################################################
+macro(kompute_option OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
+    option(${OPTION_NAME} ${OPTION_TEXT} ${OPTION_DEFAULT})
+    # The line above declares OPTION_NAME via option() with help text OPTION_TEXT and default OPTION_DEFAULT.
+    if(DEFINED ENV{${OPTION_NAME}})
+        # Allow overriding the option through an environment variable
+        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
+    endif()
+    # When the resulting value is true, expose it to the sources as a bare -D<OPTION_NAME> definition.
+    if(${OPTION_NAME})
+        add_definitions(-D${OPTION_NAME})
+    endif()
+    # Report the effective value (after any environment override) in the configure output.
+    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
+endmacro()
+
+macro(kompute_log_level OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
+    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
+    set_property(CACHE ${OPTION_NAME} PROPERTY STRINGS "Trace" "Debug" "Info" "Warn" "Error" "Critical" "Default" "Off")
+    # OPTION_NAME is now a STRING cache entry; the STRINGS property lists the documented choices.
+    if(DEFINED ENV{${OPTION_NAME}})
+        # Allow setting the option through an environment variable
+        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
+    endif()
+    # A true value (anything but e.g. "Off") is exposed as a bare -D<OPTION_NAME> definition.
+    if(${OPTION_NAME})
+        add_definitions(-D${OPTION_NAME})
+    endif()
+
+    # Allow disabling logging completely and prevent linking against it (tests the option handed to this macro; quoted so an empty value is safe):
+    if("${${OPTION_NAME}}" STREQUAL "Off")
+        set(${OPTION_NAME}_DISABLED ON)
+        add_compile_definitions(${OPTION_NAME}_DISABLED=1)
+    endif()
+
+    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
+endmacro()
+
+macro(kompute_option_string OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
+    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
+    # The line above declares OPTION_NAME as a STRING cache entry with help text OPTION_TEXT and default OPTION_DEFAULT.
+    if(DEFINED ENV{${OPTION_NAME}})
+        # Allow setting the option through an environment variable
+        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
+    endif()
+    # Only a bare -D<OPTION_NAME> is added when the value is true; the string value itself is not passed to the compiler.
+    if(${OPTION_NAME})
+        add_definitions(-D${OPTION_NAME})
+    endif()
+    # Report the effective value (after any environment override) in the configure output.
+    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
+endmacro()
+
+message(STATUS "General purpose GPU compute framework built on Vulkan")
+message(STATUS "=======================================================")
+
+# Build options
+kompute_log_level(KOMPUTE_OPT_LOG_LEVEL "Internally we use Spdlog or fmt for logging, depending on the value of 'KOMPUTE_OPT_USE_SPDLOG'. The log level used can be changed here. Possible values: 'Trace', 'Debug', 'Info', 'Warn', 'Error', 'Critical', 'Off', 'Default'. If set to 'Off' logging will be deactivated completely. If set to 'Default', the log level will be set to 'Info' for release builds and 'Debug' else." "Off")
+kompute_option(KOMPUTE_OPT_USE_SPDLOG "If enabled, logging via KP_LOG_<DEBUG, INFO, etc...> will happen through Spdlog instead of plain fmt." OFF)
+kompute_option(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS "Explicitly disable debug layers even on debug." ON)
+kompute_option(KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK "If enabled, skips the check of whether your driver supports the Vulkan Header version you are linking against. Skipping it might be useful in case you build shared on a different system than you run later." OFF)
+kompute_option(KOMPUTE_OPT_BUILD_SHADERS "Rebuilds all compute shaders during compilation and does not use the already precompiled versions. Requires glslangValidator to be installed on your system." OFF)
+
+# External components
+kompute_option(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG "Use the built-in version of Spdlog. Requires 'KOMPUTE_OPT_USE_SPDLOG' to be set to ON in order to have any effect." ON)
+kompute_option(KOMPUTE_OPT_SPDLOG_ASYNC_MODE "If spdlog is enabled this allows for selecting whether the default logger setup creates sync or async logger" OFF)
+kompute_option(KOMPUTE_OPT_USE_BUILT_IN_FMT "Use the built-in version of fmt." ON)
+kompute_option(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER "Use the built-in version of Vulkan Headers. This could be helpful in case your system Vulkan Headers are too new for your driver. If you set this to OFF, please make sure your system Vulkan Headers are supported by your driver." ON)
+kompute_option_string(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "The git tag used for the built-in Vulkan Headers when 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER' is enabled. A list of tags can be found here: https://github.com/KhronosGroup/Vulkan-Headers/tags" "v1.3.231")
+message(STATUS "=======================================================")
+
+# ####################################################
+# Deprecated Options
+# ####################################################
+include(cmake/deprecation_warnings.cmake)
+
+# ####################################################
+# Dependencies
+# ####################################################
+include(cmake/vulkan_shader_compiler.cmake)
+include(cmake/check_vulkan_version.cmake)
+include(FetchContent)
+
+# Vulkan Header: optionally fetch Vulkan-Headers at the tag given by KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG
+if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
+    FetchContent_Declare(vulkan_header GIT_REPOSITORY https://github.com/KhronosGroup/Vulkan-Headers.git
+        GIT_TAG ${KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG}) # Source: https://github.com/KhronosGroup/Vulkan-Headers/tags
+    FetchContent_MakeAvailable(vulkan_header)
+
+    if(NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
+        # Ensure the driver supports the Vulkan version of the fetched (built-in) headers
+        check_vulkan_version(INCLUDE_DIR "${vulkan_header_SOURCE_DIR}/include")
+    endif()
+endif()
+
+find_package(Vulkan REQUIRED)
+# Provide a Vulkan::Headers interface target when find_package(Vulkan) did not define one itself.
+if(Vulkan_FOUND AND NOT TARGET Vulkan::Headers)
+    add_library(Vulkan::Headers INTERFACE IMPORTED)
+    set_target_properties(Vulkan::Headers PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${Vulkan_INCLUDE_DIRS}")
+endif()
+
+if(NOT KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER AND NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
+    # Ensure the driver supports the Vulkan version of the headers in Vulkan_INCLUDE_DIR (built-in headers not used)
+    check_vulkan_version(INCLUDE_DIR ${Vulkan_INCLUDE_DIR})
+endif()
+
+# Spdlog: only fetched or located when KOMPUTE_OPT_USE_SPDLOG is on and logging is not disabled
+if(KOMPUTE_OPT_USE_SPDLOG)
+    add_compile_definitions(KOMPUTE_OPT_USE_SPDLOG=1)
+    # KOMPUTE_OPT_LOG_LEVEL_DISABLED is set by kompute_log_level() when the log level is "Off".
+    if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
+        if(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG)
+            set(SPDLOG_BUILD_SHARED ${BUILD_SHARED_LIBS})
+
+            FetchContent_Declare(spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git
+                GIT_TAG v1.10.0) # Source: https://github.com/gabime/spdlog/releases
+            FetchContent_MakeAvailable(spdlog)
+        else()
+            find_package(spdlog REQUIRED)
+        endif()
+    endif()
+endif()
+
+# fmt: always required here, either fetched at the pinned tag or located via find_package
+if(KOMPUTE_OPT_USE_BUILT_IN_FMT)
+    FetchContent_Declare(fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git
+        GIT_TAG 10.0.0) # Source: https://github.com/fmtlib/fmt/releases
+    FetchContent_MakeAvailable(fmt)
+else()
+    find_package(fmt REQUIRED)
+endif()
+
+# ####################################################
+# Preprocessor Macros
+# ####################################################
+if(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS)
+    add_compile_definitions(KOMPUTE_DISABLE_VK_DEBUG_LAYERS=1)
+endif()
+
+# Every compiler except MSVC gets the strict warning set, with warnings treated as errors.
+if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+    string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wpedantic -Werror")
+endif()
+
+# A cloned glslang provides SPIRV/GlslangToSpv.h, whereas an installed one provides
+# glslang/SPIRV/GlslangToSpv.h; USE_EXTERNAL_GLSLANG marks the cloned case (more info in #193).
+if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
+    add_definitions(-DUSE_EXTERNAL_GLSLANG)
+endif()
+
+# Allow scripts to call main kompute Makefile: adds a custom target named KOMPUTE_MAKE_TARGET that runs `make -C ${PROJECT_SOURCE_DIR} <target>`
+function(kompute_make KOMPUTE_MAKE_TARGET)
+    add_custom_target(${KOMPUTE_MAKE_TARGET}
+        COMMAND make -C ${PROJECT_SOURCE_DIR} ${KOMPUTE_MAKE_TARGET})
+endfunction()
+
+add_executable(xxd external/bin/xxd.c)
+
+add_subdirectory(src)
diff --git a/kompute/LICENSE b/kompute/LICENSE
new file mode 100644
index 000000000..821a2723e
--- /dev/null
+++ b/kompute/LICENSE
@@ -0,0 +1,203 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2021 The Institute for Ethical AI & Machine Learning
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/kompute/Makefile b/kompute/Makefile
new file mode 100644
index 000000000..62ad68b46
--- /dev/null
+++ b/kompute/Makefile
@@ -0,0 +1,210 @@
+# This makefile is optimized to be run from WSL and to interact with the 
+# Windows host as there are limitations when building GPU programs. This
+# makefile contains the commands for interacting with the visual studio
+# build via command line for faster iterations, as the intention is to 
+# support other editors (optimised for vim). There are also commands that
+# support the builds for linux-native compilations and these are the commands
+# starting with mk_.
+
+VERSION := $(shell cat ./VERSION)
+
+VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsystems\\vcpkg.cmake"
+VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake"
+
+# These are the tests that don't work with swiftshader but can be run directly with vulkan
+FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps:TestPushConstants.TestConstantsDouble"
+
+ifeq ($(OS),Windows_NT)     # is Windows_NT on XP, 2000, 7, Vista, 10...
+	CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe"
+	SCMP_BIN ?= "C:\\VulkanSDK\\1.2.141.2\\Bin32\\glslangValidator.exe"
+	MSBUILD_BIN ?= "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\MSBuild\\Current\\Bin\\MSBuild.exe"
+else
+	CLANG_FORMAT_BIN ?= "/home/alejandro/Programming/lib/clang+llvm-10.0.0-x86_64-linux-gnu-ubuntu-18.04/bin/clang-format"
+	CMAKE_BIN ?= "/c/Program Files/CMake/bin/cmake.exe"
+	MSBUILD_BIN ?= "/c/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
+	# Choosing the binary based on whether it's on WSL (kernel release ending in "Microsoft") or linux-native; POSIX sh only, "$$" yields a literal "$"
+	KERNEL := $(shell uname -r)
+	IS_WSL := $(shell echo "$(KERNEL)" | grep -q 'Microsoft$$' && echo '0')
+	ifeq ($(IS_WSL),0)
+		SCMP_BIN ?= "/c/VulkanSDK/1.2.141.2/Bin32/glslangValidator.exe"
+	else
+		SCMP_BIN ?= "/usr/bin/glslangValidator"
+	endif
+endif
+
+
+####### Main Target Rules #######
+
+push_docs_to_ghpages:
+	GIT_DEPLOY_DIR="build/docs/sphinx/" \
+		GIT_DEPLOY_BRANCH="gh-pages" \
+		GIT_DEPLOY_REPO="origin" \
+			./scripts/push_folder_to_branch.sh
+
+####### CMAKE quickstart commands #######
+
+clean_cmake:
+	rm -rf build/
+
+####### Linux-native (mk_) build shortcut commands #######
+
+MK_BUILD_TYPE ?= "Release"
+MK_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
+MK_CMAKE_EXTRA_FLAGS ?= ""
+MK_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
+
+mk_cmake:
+	cmake \
+		-Bbuild \
+		-DCMAKE_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \
+		-DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \
+		-DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \
+		-DKOMPUTE_OPT_INSTALL=ON \
+		-DKOMPUTE_OPT_BUILD_TESTS=ON \
+		-DKOMPUTE_OPT_BUILD_DOCS=ON \
+		-DKOMPUTE_OPT_BUILD_SHADERS=ON \
+		-DKOMPUTE_OPT_CODE_COVERAGE=ON \
+		-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+		-DKOMPUTE_OPT_LOG_LEVEL=Debug \
+		$(MK_CMAKE_EXTRA_FLAGS) \
+		-G "Unix Makefiles"
+
+mk_build_all:
+	cmake --build build/. --parallel
+
+mk_build_docs:
+	cmake --build build/. --target gendocsall --parallel
+
+mk_build_kompute:
+	cmake --build build/. --target kompute --parallel
+
+mk_build_tests:
+	cmake --build build/. --target kompute_tests --parallel
+
+mk_run_docs: mk_build_docs # Build the docs, then serve build/docs/sphinx over HTTP with the Python 3 standard-library server
+	(cd build/docs/sphinx && python3 -m http.server)
+
+# An alternative would be: ctest -vv --test-dir build/.
+# But this is not possible since we need to filter specific tests, not complete executables, which is not possible with ctest.
+# https://gitlab.kitware.com/cmake/cmake/-/issues/13168 
+mk_run_tests: mk_build_tests
+	./build/bin/kompute_tests --gtest_filter=$(FILTER_TESTS)
+
+mk_build_swiftshader_library:
+	git clone https://github.com/google/swiftshader || echo "Assuming already cloned"
+	# GCC 8 or above is required otherwise error on "filesystem" lib will appear
+	CC="/usr/bin/gcc-8" CXX="/usr/bin/g++-8" cmake swiftshader/. -Bswiftshader/build/
+	cmake --build swiftshader/build/. --parallel
+
+mk_run_tests_cpu: export VK_ICD_FILENAMES=$(PWD)/swiftshader/build/vk_swiftshader_icd.json
+mk_run_tests_cpu: mk_build_swiftshader_library mk_build_tests mk_run_tests # the target-specific export above is inherited by these prerequisites
+
+
+####### Visual studio build shortcut commands #######
+
+VS_BUILD_TYPE ?= "Debug"
+# Run with multiprocessing / parallel build by default
+VS_CMAKE_EXTRA_FLAGS ?= ""
+VS_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
+VS_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
+
+vs_cmake:
+	$(CMAKE_BIN) \
+		-Bbuild \
+		$(VS_CMAKE_EXTRA_FLAGS) \
+		-DCMAKE_TOOLCHAIN_FILE=$(VCPKG_WIN_PATH) \
+		-DCMAKE_CXX_FLAGS=$(VS_KOMPUTE_EXTRA_CXX_FLAGS) \
+		-DCMAKE_INSTALL_PREFIX=$(VS_INSTALL_PATH) \
+		-DKOMPUTE_OPT_INSTALL=ON \
+		-DKOMPUTE_OPT_BUILD_TESTS=ON \
+		-DKOMPUTE_OPT_BUILD_SHADERS=ON \
+		-DKOMPUTE_OPT_CODE_COVERAGE=OFF \
+		-DKOMPUTE_OPT_BUILD_DOCS=OFF \
+		-G "Visual Studio 16 2019" \
+		-DCMAKE_BUILD_TYPE=$(VS_BUILD_TYPE)
+
+vs_build_all:
+	cmake --build build/. --parallel
+
+vs_build_docs:
+	cmake --build build/. --target gendocsall --parallel
+
+vs_install_kompute:
+	cmake --build build/. --target install --parallel
+
+vs_build_kompute:
+	cmake --build build/. --target kompute --parallel
+
+vs_build_tests:
+	cmake --build build/. --target kompute_tests --parallel
+
+vs_run_docs: vs_build_docs # Build the docs, then serve build/docs/sphinx over HTTP with the Python 3 standard-library server
+	(cd build/docs/sphinx && python3 -m http.server)
+
+vs_run_tests: vs_build_tests
+	./build/test/$(VS_BUILD_TYPE)/bin/kompute_tests.exe --gtest_filter=$(FILTER_TESTS)
+
+
+#### PYTHON ####
+
+test_python:
+	python3 -m pytest -s --log-cli-level=DEBUG -v python/test/
+
+####### Run CI Commands #######
+
+# This command uses act to replicate github action
+# https://github.com/nektos/act
+run_ci:
+	act
+
+####### General project commands #######
+
+generate_python_docstrings:
+	python -m pybind11_mkdoc \
+		-o python/src/docstrings.hpp \
+		kompute/Kompute.hpp \
+		-Iexternal/fmt/include/ \
+		-Iexternal/spdlog/include/ \
+		-Iexternal/glslang/ \
+		-I/usr/include/c++/7.5.0/
+
+install_python_reqs:
+	python3 -m pip install -r scripts/requirements.txt
+
+install_lcov:
+	sudo apt install lcov -y
+
+build_shaders:
+	python3 scripts/convert_shaders.py \
+		--shader-path shaders/glsl \
+		--shader-binary $(SCMP_BIN) \
+		--header-path src/include/kompute/shaders/ \
+		-v
+	python3 scripts/convert_shaders.py \
+		--shader-path test/shaders/glsl \
+		--shader-binary $(SCMP_BIN) \
+		--header-path test/compiled_shaders_include/kompute_test/shaders/ \
+		-v
+
+build_single_header:
+	quom \
+		--include_directory \
+		"src/include/" \
+		"single_include/AggregateHeaders.cpp" \
+		"single_include/kompute/Kompute.hpp"
+
+win_build_xxd:
+	cd external/bin/ && gcc.exe -o xxd.exe xxd.c -DCYGWIN
+
+format: # Run clang-format in place on the C/C++ sources and headers whose path does not contain "shaders"
+	for val in "examples single_include src test" ; do \
+    	find $$val -depth -iname '*.h' -or -iname '*.c' -or -iname '*.hpp' -or -iname '*.cpp' | grep -v "shaders" | xargs $(CLANG_FORMAT_BIN) -style=file -i; \
+	done
+
+static_scan:
+	cppcheck --project=build/compile_commands.json -iexternal/
+
+build_changelog:
+	docker run --rm -it -v "$(PWD)":/usr/local/src/your-app -e CHANGELOG_GITHUB_TOKEN=${CHANGELOG_GITHUB_TOKEN} ferrarimarco/github-changelog-generator:1.15.2 -u KomputeProject -p kompute
+	chmod 664 CHANGELOG.md # (Read+Write, Read+Write, Read)
+	sed -i -e 's/\(HEAD\|Unreleased\)/v${VERSION}/g' CHANGELOG.md # Replacing unreleased version with latest tag
diff --git a/kompute/README.md b/kompute/README.md
new file mode 100644
index 000000000..b169da254
--- /dev/null
+++ b/kompute/README.md
@@ -0,0 +1,513 @@
+
+![GitHub](https://img.shields.io/badge/Version-0.7.0-green.svg)
+![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
+![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
+![GitHub](https://img.shields.io/badge/Python-3.7—3.9-blue.svg)
+![GitHub](https://img.shields.io/badge/License-Apache-black.svg)
+[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/4834/badge)](https://bestpractices.coreinfrastructure.org/projects/4834)
+
+<table>
+<tr>
+
+<td width="20%">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute.jpg">
+</td>
+
+<td>
+
+<h1>Kompute</h1>
+<h3>The general purpose GPU compute framework for cross vendor graphics cards (AMD, Qualcomm, NVIDIA & friends)</h3>
+
+</td>
+
+</tr>
+</table>
+
+<h4>Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU acceleration usecases.</h4>
+
+💬 [Join the Discord & Community Calls](https://kompute.cc/overview/community.html) 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾
+
+<hr>
+
+##### Kompute is backed by the Linux Foundation as a <a href="https://lfaidata.foundation/blog/2021/08/26/kompute-joins-lf-ai-data-as-new-sandbox-project/">hosted project</a> by the LF AI & Data Foundation.
+
+<table>
+<tr>
+<td>
+<a href="https://www.linuxfoundation.org/projects/">
+<img src="https://upload.wikimedia.org/wikipedia/commons/b/b5/Linux_Foundation_logo.png">
+</a>
+</td>
+<td>
+<a href="https://lfaidata.foundation/projects/">
+<img src="https://raw.githubusercontent.com/lfai/artwork/main/lfaidata-assets/lfaidata/horizontal/color/lfaidata-horizontal-color.png">
+</a>
+</td>
+</tr>
+</table>
+
+
+## Principles & Features
+
+* [Flexible Python module](#your-first-kompute-python) with [C++ SDK](#your-first-kompute-c) for optimizations
+* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
+* [Mobile enabled](#mobile-enabled) with examples via Android NDK across several architectures
+* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
+* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
+* Robust codebase with [90% unit test code coverage](https://kompute.cc/codecov/)
+* Advanced use-cases on [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) and [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0).
+* Active community with [monthly calls, discord chat and more](https://kompute.cc/overview/community.html)
+
+![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/komputer-logos.gif)
+
+## Getting Started
+
+Below you can find a GPU multiplication example using the C++ and Python Kompute interfaces.
+
+You can [join the Discord](https://discord.gg/MaH5Jv5zwv) for questions / discussion, open a [github issue](https://github.com/KomputeProject/kompute/issues/new), or read [the documentation](https://kompute.cc/).
+
+### Your First Kompute (C++)
+
+The C++ interface provides low level access to the native components of Kompute, enabling [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html).
+
+```c++
+
+void kompute(const std::string& shader) {
+
+    // 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
+    kp::Manager mgr; 
+
+    // 2. Create and initialise Kompute Tensors through manager
+
+    // Default tensor constructor simplifies creation of float values
+    auto tensorInA = mgr.tensor({ 2., 2., 2. });
+    auto tensorInB = mgr.tensor({ 1., 2., 3. });
+    // Explicit type constructor supports uint32, int32, double, float and bool
+    auto tensorOutA = mgr.tensorT<uint32_t>({ 0, 0, 0 });
+    auto tensorOutB = mgr.tensorT<uint32_t>({ 0, 0, 0 });
+
+    std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
+
+    // 3. Create algorithm based on shader (supports buffers & push/spec constants)
+    kp::Workgroup workgroup({3, 1, 1});
+    std::vector<float> specConsts({ 2 });
+    std::vector<float> pushConstsA({ 2.0 });
+    std::vector<float> pushConstsB({ 3.0 });
+
+    auto algorithm = mgr.algorithm(params,
+                                   // See documentation shader section for compileSource
+                                   compileSource(shader),
+                                   workgroup,
+                                   specConsts,
+                                   pushConstsA);
+
+    // 4. Run operation synchronously using sequence
+    mgr.sequence()
+        ->record<kp::OpTensorSyncDevice>(params)
+        ->record<kp::OpAlgoDispatch>(algorithm) // Binds default push consts
+        ->eval() // Evaluates the two recorded operations
+        ->record<kp::OpAlgoDispatch>(algorithm, pushConstsB) // Overrides push consts
+        ->eval(); // Evaluates only last recorded operation
+
+    // 5. Sync results from the GPU asynchronously
+    auto sq = mgr.sequence();
+    sq->evalAsync<kp::OpTensorSyncLocal>(params);
+
+    // ... Do other work asynchronously whilst GPU finishes
+
+    sq->evalAwait();
+
+    // Prints the first output which is: { 4, 8, 12 }
+    for (const float& elem : tensorOutA->vector()) std::cout << elem << "  ";
+    // Prints the second output which is: { 10, 10, 10 }
+    for (const float& elem : tensorOutB->vector()) std::cout << elem << "  ";
+
+} // Manages / releases all CPU and GPU memory resources
+
+int main() {
+
+    // Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
+    // files). This shader shows some of the main components including constants, buffers, etc
+    std::string shader = (R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
+        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
+
+        // Kompute supports push constants updated on dispatch
+        layout(push_constant) uniform PushConstants {
+            float val;
+        } push_const;
+
+        // Kompute also supports spec constants on initialization
+        layout(constant_id = 0) const float const_one = 0;
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            out_a[index] += uint( in_a[index] * in_b[index] );
+            out_b[index] += uint( const_one * push_const.val );
+        }
+    )");
+
+    // Run the function declared above with our raw string shader
+    kompute(shader);
+}
+
+```
+
+### Your First Kompute (Python)
+
+The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables experimentation whilst ensuring high performance and fast development workflows.
+
+```python
+
+from .utils import compile_source # using util function from python/test/utils
+
+def kompute(shader):
+    # 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
+    mgr = kp.Manager()
+
+    # 2. Create and initialise Kompute Tensors through manager
+
+    # Default tensor constructor simplifies creation of float values
+    tensor_in_a = mgr.tensor([2, 2, 2])
+    tensor_in_b = mgr.tensor([1, 2, 3])
+    # Explicit type constructor supports uint32, int32, double, float and bool
+    tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
+    tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
+
+    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
+
+    # 3. Create algorithm based on shader (supports buffers & push/spec constants)
+    workgroup = (3, 1, 1)
+    spec_consts = [2]
+    push_consts_a = [2]
+    push_consts_b = [3]
+
+    # See documentation shader section for compile_source
+    spirv = compile_source(shader)
+
+    algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a)
+
+    # 4. Run operation synchronously using sequence
+    (mgr.sequence()
+        .record(kp.OpTensorSyncDevice(params))
+        .record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided
+        .eval() # evaluates the two recorded ops
+        .record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts
+        .eval()) # evaluates only the last recorded op
+
+    # 5. Sync results from the GPU asynchronously
+    sq = mgr.sequence()
+    sq.eval_async(kp.OpTensorSyncLocal(params))
+
+    # ... Do other work asynchronously whilst GPU finishes
+
+    sq.eval_await()
+
+    # Prints the first output which is: { 4, 8, 12 }
+    print(tensor_out_a)
+    # Prints the second output which is: { 10, 10, 10 }
+    print(tensor_out_b)
+
+if __name__ == "__main__":
+
+    # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
+    # files). This shader shows some of the main components including constants, buffers, etc
+    shader = """
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
+        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
+
+        // Kompute supports push constants updated on dispatch
+        layout(push_constant) uniform PushConstants {
+            float val;
+        } push_const;
+
+        // Kompute also supports spec constants on initialization
+        layout(constant_id = 0) const float const_one = 0;
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            out_a[index] += uint( in_a[index] * in_b[index] );
+            out_b[index] += uint( const_one * push_const.val );
+        }
+    """
+
+    kompute(shader)
+
+```
+
+### Interactive Notebooks & Hands on Videos
+
+You are able to try out the interactive Colab Notebooks which allow you to use a free GPU. The available examples are the Python and C++ examples below:
+
+<table>
+<tr>
+
+<td width="50%">
+<h5>Try the interactive <a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?usp=sharing">C++ Colab</a> from <a href="https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a">Blog Post</a></h5>
+</td>
+
+<td>
+<h5>Try the interactive <a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">Python Colab</a> from <a href="https://towardsdatascience.com/beyond-cuda-gpu-accelerated-python-for-machine-learning-in-cross-vendor-graphics-cards-made-simple-6cc828a45cc3">Blog Post</a></h5>
+</td>
+
+</tr>
+<tr>
+
+<td width="50%">
+<a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?authuser=1#scrollTo=1BipBsO-fQRD">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-cpp.jpg">
+</a>
+</td>
+
+<td>
+<a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-python.jpg">
+</a>
+</td>
+
+</tr>
+</table>
+
+
+You can also check out the two following talks presented at the FOSDEM 2021 conference. 
+
+Both videos have timestamps which will allow you to skip to the most relevant section for you - the intro & motivations for both is almost the same so you can skip to the more specific content.
+
+<table>
+<tr>
+
+<td width="50%">
+<h5>Watch the video for <a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">C++ Enthusiasts</a> </h5>
+</td>
+
+<td>
+<h5>Watch the video for <a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">Python & Machine Learning</a> Enthusiasts</h5>
+</td>
+
+</tr>
+<tr>
+
+<td width="50%">
+<a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-cpp-video.png">
+</a>
+</td>
+
+<td>
+<a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-python-video.png">
+</a>
+</td>
+
+</tr>
+</table>
+
+
+## Architectural Overview
+
+The core architecture of Kompute includes the following:
+* [Kompute Manager](https://kompute.cc/overview/reference.html#manager) - Base orchestrator which creates and manages device and child components
+* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
+* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
+* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
+* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed in the GPU
+
+To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).
+
+<table>
+<th>
+Full Architecture
+</th>
+<th>
+Simplified Kompute Components
+</th>
+<tr>
+<td width=30%>
+
+
+<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-vulkan-architecture.jpg">
+
+<br>
+<br>
+(very tiny, check the <a href="https://ethicalml.github.io/vulkan-kompute/overview/reference.html">full reference diagram in docs for details</a>)
+<br>
+<br>
+
+<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/suspicious.jfif">
+
+</td>
+<td>
+<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-architecture.jpg">
+</td>
+</tr>
+</table>
+
+
+## Asynchronous and Parallel Operations
+
+Kompute provides flexibility to run operations in an asynchronous way through vk::Fences. Furthermore, Kompute enables explicit allocation of queues, which allows for parallel execution of operations across queue families.
+
+The image below provides an intuition on how Kompute Sequences can be allocated to different queues to enable parallel execution based on hardware. You can see the [hands on example](https://kompute.cc/overview/advanced-examples.html#parallel-operations), as well as the [detailed documentation page](https://kompute.cc/overview/async-parallel.html) describing how it would work using an NVIDIA 1650 as an example. 
+
+![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/queue-allocation.jpg)
+
+## Mobile Enabled
+
+Kompute has been optimized to work in mobile environments. The [build system](#c-build-overview) enables dynamic loading of the Vulkan shared library for Android environments, together with a working [Android NDK wrapper](https://github.com/KomputeProject/kompute/tree/master/vk_ndk_wrapper_include) for the CPP headers.
+
+<table>
+<tr>
+
+<td width="70%">
+<p>
+For a full deep dive you can read the blog post "<a href="https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617">Supercharging your Mobile Apps with On-Device GPU Accelerated Machine Learning</a>". 
+
+You can also access the <a href="https://github.com/KomputeProject/kompute/tree/v0.4.0/examples/android/android-simple">end-to-end example code</a> in the repository, which can be run using android studio.
+
+</p>
+
+
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-editor.jpg">
+
+</td>
+
+
+<td width="30%">
+<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-kompute.jpg">
+</td>
+
+</tr>
+</table>
+
+## More examples
+
+### Simple examples
+
+* [Simple multiplication example](https://kompute.cc/overview/advanced-examples.html#simple-shader-example)
+* [Record batch commands with a Kompute Sequence](https://kompute.cc/overview/advanced-examples.html#record-batch-commands)
+* [Run Asynchronous Operations](https://kompute.cc/overview/advanced-examples.html#asynchronous-operations)
+* [Run Parallel Operations Across Multiple GPU Queues](https://kompute.cc/overview/advanced-examples.html#parallel-operations)
+* [Create your custom Kompute Operations](https://kompute.cc/overview/advanced-examples.html#your-custom-kompute-operation)
+* [Implementing logistic regression from scratch](https://kompute.cc/overview/advanced-examples.html#logistic-regression-example)
+
+### End-to-end examples
+
+* [Machine Learning Logistic Regression Implementation](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a)
+* [Parallelizing GPU-intensive Workloads via Multi-Queue Operations](https://towardsdatascience.com/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc)
+* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
+* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
+
+## Python Package
+
+Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc.
+
+The only dependencies are Python 3.5+ and Cmake 3.4.1+. You can install Kompute from the [Python pypi package](https://pypi.org/project/kp/) using the following command.
+
+```
+pip install kp
+```
+
+You can also install from master branch using:
+
+```
+pip install git+https://github.com/KomputeProject/kompute.git@master
+```
+
+For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
+
+## C++ Build Overview
+
+The build system provided uses `cmake`, which allows for cross platform builds.
+
+The top level `Makefile` provides a set of optimized configurations for development as well as the docker image build, but you can start a build with the following command:
+
+```
+   cmake -Bbuild
+```
+
+You also are able to add Kompute in your repo with `add_subdirectory` - the [Android example CMakeLists.txt file](https://github.com/KomputeProject/kompute/blob/7c8c0eeba2cdc098349fcd999102bb2cca1bf711/examples/android/android-simple/app/src/main/cpp/CMakeLists.txt#L3) shows how this would be done.
+
+For a more advanced overview of the build configuration check out the [Build System Deep Dive](https://kompute.cc/overview/build-system.html) documentation.
+
+## Kompute Development
+
+We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Kompute and reporting issues is a great contribution!
+
+### Contributing
+
+#### Dev Dependencies
+
+* Testing
+    + GTest
+* Documentation
+    + Doxygen (with Dot)
+    + Sphinx
+
+#### Development
+
+* Follows Mozilla C++ Style Guide https://www-archive.mozilla.org/hacking/mozilla-style-guide.html
+    + Uses post-commit hook to run the linter, you can set it up so it runs the linter before commit
+    + All dependencies are defined in vcpkg.json 
+* Uses cmake as build system, and provides a top level makefile with recommended command
+* Uses xxd (or xxd.exe windows 64bit port) to convert shader spirv to header files
+* Uses doxygen and sphinx for documentation and autodocs
+* Uses vcpkg for finding the dependencies, it's the recommended set up to retrieve the libraries
+
+If you want to run with debug layers you can add them with the `KOMPUTE_ENV_DEBUG_LAYERS` parameter as:
+
+```
+export KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_LUNARG_api_dump"
+```
+
+##### Updating documentation
+
+To update the documentation you will need to:
+* Run the gendoxygen target in the build system
+* Run the gensphynx target in the build-system 
+* Push to github pages with `make push_docs_to_ghpages`
+
+##### Running tests
+
+Running the unit tests has been significantly simplified for contributors.
+
+The tests run on CPU, and can be triggered using the ACT command line interface (https://github.com/nektos/act) - once you install the command line (And start the Docker daemon) you just have to type:
+
+```
+$ act
+
+[Python Tests/python-tests] 🚀  Start image=axsauze/kompute-builder:0.2
+[C++ Tests/cpp-tests      ] 🚀  Start image=axsauze/kompute-builder:0.2
+[C++ Tests/cpp-tests      ]   🐳  docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
+[Python Tests/python-tests]   🐳  docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
+...
+```
+
+The repository contains unit tests for the C++ and Python code, and can be found under the `test/` and `python/test` folder.
+
+The tests are currently run through the CI using Github Actions. It uses the images found in `docker-builders/`.
+
+In order to minimise hardware requirements the tests can run without a GPU, directly in the CPU using [Swiftshader](https://github.com/google/swiftshader).
+
+For more information on how the CI and tests are setup, you can go to the [CI, Docker and Tests Section](https://kompute.cc/overview/ci-tests.html) in the documentation.
+
+## Motivations
+
+This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
+
+The Vulkan SDK offers a great low level interface that enables highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of the Vulkan SDK. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
+
+We are currently developing Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on the Vulkan SDK's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Kompute architecture.
diff --git a/kompute/cmake/bin2h.cmake b/kompute/cmake/bin2h.cmake
new file mode 100644
index 000000000..21ad56cb1
--- /dev/null
+++ b/kompute/cmake/bin2h.cmake
@@ -0,0 +1,106 @@
+##################################################################################
+# Based on: https://github.com/sivachandran/cmake-bin2h
+#
+# Copyright 2020 Sivachandran Paramasivam
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+##################################################################################
+
+include(CMakeParseArguments)
+
+# Function to wrap a given string into multiple lines at the given column position.
+# Parameters:
+#   VARIABLE    - The name of the CMake variable holding the string; overwritten in the caller's scope.
+#   AT_COLUMN   - The column position at which string will be wrapped (must be greater than 0).
+function(WRAP_STRING)
+    # No flag options exist here; pass "" instead of an `options` variable inherited from the caller.
+    cmake_parse_arguments(WRAP_STRING "" "VARIABLE;AT_COLUMN" "" ${ARGN})
+    if(NOT WRAP_STRING_AT_COLUMN GREATER 0)
+        message(FATAL_ERROR "WRAP_STRING: AT_COLUMN must be an integer greater than 0.")
+    endif()
+
+    # Function scopes inherit the caller's variables, so reset the accumulator explicitly.
+    set(lines "")
+    set(offset 0)
+    string(LENGTH "${${WRAP_STRING_VARIABLE}}" stringLength)
+    while(stringLength GREATER 0)
+        # Each chunk is a full column's worth of characters, or the remainder on the last pass.
+        set(length ${WRAP_STRING_AT_COLUMN})
+        if(length GREATER stringLength)
+            set(length ${stringLength})
+        endif()
+        string(SUBSTRING "${${WRAP_STRING_VARIABLE}}" ${offset} ${length} line)
+        set(lines "${lines}\n${line}")  # every chunk is preceded, not followed, by a newline
+        math(EXPR stringLength "${stringLength} - ${length}")
+        math(EXPR offset "${offset} + ${length}")
+    endwhile()
+    set(${WRAP_STRING_VARIABLE} "${lines}" PARENT_SCOPE)
+endfunction()
+
+# Function to embed contents of a file as a std::array<uint32_t, N> in a C++ header file (.h).
+# Parameters
+#   SOURCE_FILE      - The path of source file whose contents will be embedded in the header file.
+#   VARIABLE_NAME    - The name of the array variable (converted to an upper-case C identifier).
+#   HEADER_FILE      - The path of header file.
+#   APPEND           - If specified appends to the header file instead of overwriting it
+#   NULL_TERMINATE   - If specified a null byte(zero) will be appended; the array size excludes it.
+#   HEADER_NAMESPACE - The namespace, where the array should be located in.
+#   IS_BIG_ENDIAN    - If true, the byte order within each uint32_t is not reversed (big endian hosts).
+# HEADER_NAMESPACE and IS_BIG_ENDIAN fall back to same-named variables of the calling scope.
+# Usage:
+#   bin2h(SOURCE_FILE "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG" HEADER_NAMESPACE "logo")
+function(BIN2H)
+    set(options APPEND NULL_TERMINATE)
+    set(oneValueArgs SOURCE_FILE VARIABLE_NAME HEADER_FILE HEADER_NAMESPACE IS_BIG_ENDIAN)
+    cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}" "" ${ARGN})
+    # callers that predate these two keywords set plain variables instead; keep honouring them
+    foreach(keyword HEADER_NAMESPACE IS_BIG_ENDIAN)
+        if(NOT DEFINED BIN2H_${keyword})
+            set(BIN2H_${keyword} "${${keyword}}")
+        endif()
+    endforeach()
+
+    # reads source file contents as hex string
+    file(READ "${BIN2H_SOURCE_FILE}" hexString HEX)
+    string(LENGTH "${hexString}" hexStringLength)
+
+    # appends null byte if asked
+    if(BIN2H_NULL_TERMINATE)
+        set(hexString "${hexString}00")  # NOTE(review): only converted to a word if it completes a 4-byte group
+    endif()
+
+    # wraps the hex string into multiple lines at column 32(i.e. 16 bytes per line)
+    wrap_string(VARIABLE hexString AT_COLUMN 32)
+    math(EXPR arraySize "${hexStringLength} / 8")  # 8 hex digits per uint32_t
+
+    # rewrites each group of 4 bytes as one '0x'-prefixed, comma-terminated 32-bit word
+    if(BIN2H_IS_BIG_ENDIAN)
+        message(STATUS "Interpreting shader in big endian...")
+        string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\1\\2\\3\\4, " arrayValues "${hexString}")
+    else()
+        message(STATUS "Interpreting shader in little endian...")
+        string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\4\\3\\2\\1, " arrayValues "${hexString}")
+    endif()
+    # removes trailing comma
+    string(REGEX REPLACE ", $" "" arrayValues "${arrayValues}")
+
+    # converts the variable name into proper C identifier
+    string(MAKE_C_IDENTIFIER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
+    string(TOUPPER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
+
+    # declares the array inside the requested namespace
+    set(namespaceStart "namespace ${BIN2H_HEADER_NAMESPACE} {")
+    set(namespaceEnd "} // namespace ${BIN2H_HEADER_NAMESPACE}")
+    set(arrayIncludes "#pragma once\n#include <array>\n#include <cstdint>")
+    set(arrayDefinition "const std::array<uint32_t, ${arraySize}> ${BIN2H_VARIABLE_NAME} = { ${arrayValues} };")
+    set(declarations "${arrayIncludes}\n\n${namespaceStart}\n${arrayDefinition}\n${namespaceEnd}\n\n")
+    if(BIN2H_APPEND)
+        file(APPEND "${BIN2H_HEADER_FILE}" "${declarations}")
+    else()
+        file(WRITE "${BIN2H_HEADER_FILE}" "${declarations}")
+    endif()
+endfunction()
\ No newline at end of file
diff --git a/kompute/cmake/bin_file_to_header.cmake b/kompute/cmake/bin_file_to_header.cmake
new file mode 100644
index 000000000..b47b36139
--- /dev/null
+++ b/kompute/cmake/bin_file_to_header.cmake
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.20)
+# Converts INPUT_SHADER_FILE into OUTPUT_HEADER_FILE via bin2h(); all three variables below must be set by the invoker.
+if("${INPUT_SHADER_FILE}" STREQUAL "")
+    message(FATAL_ERROR "No input file path provided via 'INPUT_SHADER_FILE'.")
+endif()
+
+if("${OUTPUT_HEADER_FILE}" STREQUAL "")
+    message(FATAL_ERROR "No output file path provided via 'OUTPUT_HEADER_FILE'.")
+endif()
+
+if("${HEADER_NAMESPACE}" STREQUAL "")
+    message(FATAL_ERROR "No header namespace provided via 'HEADER_NAMESPACE'.")
+endif()
+# Resolve bin2h.cmake next to this file so the script does not depend on the working directory.
+include("${CMAKE_CURRENT_LIST_DIR}/bin2h.cmake")
+# The input's file name (directory stripped) is used as the generated array's variable name.
+get_filename_component(BINARY_FILE_CONTENT "${INPUT_SHADER_FILE}" NAME)
+bin2h(SOURCE_FILE "${INPUT_SHADER_FILE}" HEADER_FILE "${OUTPUT_HEADER_FILE}" VARIABLE_NAME "${BINARY_FILE_CONTENT}" HEADER_NAMESPACE "${HEADER_NAMESPACE}")
+file(APPEND "${OUTPUT_HEADER_FILE}" "\n")
\ No newline at end of file
diff --git a/kompute/cmake/check_vulkan_version.cmake b/kompute/cmake/check_vulkan_version.cmake
new file mode 100644
index 000000000..0372d3206
--- /dev/null
+++ b/kompute/cmake/check_vulkan_version.cmake
@@ -0,0 +1,139 @@
+# check_vulkan_version(INCLUDE_DIR <dir>)
+# Reads the Vulkan header version from <dir>/vulkan/vulkan_core.h, runs 'vulkaninfo' and aborts the
+# configure step (FATAL_ERROR) when the header, vulkaninfo or a suitable GPU is missing. A GPU is accepted
+# when its apiVersion is at least the header version, or (with a WARNING) shares the header's minor version.
+# GPUs are tried in listed order; the first acceptable one wins. FATAL_ERROR stops processing, so no return().
+function(check_vulkan_version)
+    cmake_parse_arguments(VULKAN_CHECK_VERSION "" "INCLUDE_DIR" "" ${ARGN})
+    message(STATUS "Ensuring the currently installed driver supports the Vulkan version requested by the Vulkan Header.")
+
+    # Get the current Vulkan Header version (e.g. 1.2.189).
+    # This snippet is based on: https://gitlab.kitware.com/cmake/cmake/-/blob/v3.23.1/Modules/FindVulkan.cmake#L140-156
+    if(VULKAN_CHECK_VERSION_INCLUDE_DIR)
+        set(VULKAN_CORE_H "${VULKAN_CHECK_VERSION_INCLUDE_DIR}/vulkan/vulkan_core.h")
+        if(EXISTS "${VULKAN_CORE_H}")
+            file(STRINGS "${VULKAN_CORE_H}" VULKAN_HEADER_VERSION_LINE REGEX "^#define VK_HEADER_VERSION ")
+            string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION "${VULKAN_HEADER_VERSION_LINE}")
+            file(STRINGS "${VULKAN_CORE_H}" VULKAN_HEADER_VERSION_LINE2 REGEX "^#define VK_HEADER_VERSION_COMPLETE ")
+            if(NOT "${VULKAN_HEADER_VERSION_LINE2}" STREQUAL "")
+                string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION2 "${VULKAN_HEADER_VERSION_LINE2}")
+                list(LENGTH VULKAN_HEADER_VERSION2 _len)
+                # Versions >= 1.2.175 have an additional number in front, e.g. '0, 1, 2' instead of '1, 2'
+                if(_len EQUAL 3)
+                    list(REMOVE_AT VULKAN_HEADER_VERSION2 0)
+                endif()
+                list(APPEND VULKAN_HEADER_VERSION2 ${VULKAN_HEADER_VERSION})
+                list(JOIN VULKAN_HEADER_VERSION2 "." VULKAN_HEADER_VERSION)
+            else()
+                file(STRINGS "${VULKAN_CORE_H}" VULKAN_HEADER_API_VERSION_1_2 REGEX "^#define VK_API_VERSION_1_2.*")
+                if(NOT "${VULKAN_HEADER_API_VERSION_1_2}" STREQUAL "")
+                    set(VULKAN_HEADER_VERSION "1.2.${VULKAN_HEADER_VERSION}")
+                else()
+                    file(STRINGS "${VULKAN_CORE_H}" VULKAN_HEADER_API_VERSION_1_1 REGEX "^#define VK_API_VERSION_1_1.*")
+                    if(NOT "${VULKAN_HEADER_API_VERSION_1_1}" STREQUAL "")
+                        set(VULKAN_HEADER_VERSION "1.1.${VULKAN_HEADER_VERSION}")
+                    else()
+                        message(FATAL_ERROR "'${VULKAN_CORE_H}' does not contain a supported Vulkan version. Probably because it is < 1.1.0.")
+                    endif()
+                endif()
+            endif()
+        else()
+            message(FATAL_ERROR "'${VULKAN_CORE_H}' does not exist. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
+        endif()
+    else()
+        message(FATAL_ERROR "Invalid Vulkan include directory given. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
+    endif()
+    message(STATUS "Found Vulkan Header version: ${VULKAN_HEADER_VERSION}")
+
+    # Get Vulkan version supported by driver
+    find_program(VULKAN_INFO_PATH NAMES vulkaninfo)
+    if(NOT VULKAN_INFO_PATH)
+        message(FATAL_ERROR "vulkaninfo not found. The Vulkan SDK might not be installed properly. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
+    endif()
+
+    execute_process(COMMAND "${VULKAN_INFO_PATH}"
+                    OUTPUT_VARIABLE VULKAN_INFO_OUTPUT
+                    RESULT_VARIABLE VULKAN_INFO_RETURN)
+    if(NOT "${VULKAN_INFO_RETURN}" EQUAL 0)
+        message(FATAL_ERROR "Running vulkaninfo failed with return code ${VULKAN_INFO_RETURN}. Make sure you have 'vulkan-tools' installed. Result:\n${VULKAN_INFO_OUTPUT}?")
+    else()
+        message(STATUS "Running vulkaninfo was successful. Parsing the output...")
+    endif()
+
+    # Check that the vulkaninfo output actually contains the expected report
+    string(FIND "${VULKAN_INFO_OUTPUT}" "Vulkan Instance Version" VULKAN_INFO_SUCCESSFUL)
+    if(VULKAN_INFO_SUCCESSFUL LESS 0)
+        message(FATAL_ERROR "Running vulkaninfo failed. Make sure you have 'vulkan-tools' installed and DISPLAY is configured. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON). Result:\n${VULKAN_INFO_OUTPUT}?")
+    endif()
+
+    string(REGEX MATCHALL "(GPU[0-9]+)" GPU_IDS "${VULKAN_INFO_OUTPUT}")
+    if(NOT GPU_IDS)
+        message(FATAL_ERROR "No GPU supporting Vulkan found in vulkaninfo. Does your GPU (driver) support Vulkan?")
+    endif()
+
+    string(REGEX MATCHALL "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*" GPU_API_VERSIONS "${VULKAN_INFO_OUTPUT}")
+    if(NOT GPU_API_VERSIONS)
+        message(FATAL_ERROR "No valid Vulkan API version found in vulkaninfo. Does your GPU (driver) support Vulkan?")
+    endif()
+
+    # The two lists are paired by index below, so they must have the same length:
+    # entry i of GPU_IDS is reported together with entry i of GPU_API_VERSIONS.
+    list(LENGTH GPU_IDS GPU_IDS_LENGTH)
+    list(LENGTH GPU_API_VERSIONS GPU_API_VERSIONS_LENGTH)
+    if(NOT ${GPU_IDS_LENGTH} EQUAL ${GPU_API_VERSIONS_LENGTH})
+        message(FATAL_ERROR "Found ${GPU_IDS_LENGTH} GPUs, but ${GPU_API_VERSIONS_LENGTH} API versions in vulkaninfo. We expected to find an equal amount of them.")
+    endif()
+
+    # Compare versions
+    set(VALID_GPU "")
+    set(VALID_VULKAN_VERSION "")
+    math(EXPR ITER_LEN "${GPU_IDS_LENGTH} - 1")
+    foreach(INDEX RANGE ${ITER_LEN})
+        list(GET GPU_IDS ${INDEX} GPU)
+        list(GET GPU_API_VERSIONS ${INDEX} API_VERSION)
+
+        # Extract API version
+        if("${API_VERSION}" MATCHES "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*")
+            set(VULKAN_DRIVER_VERSION ${CMAKE_MATCH_1})
+        else()
+            message(FATAL_ERROR "API version match failed. This should not have happened...")
+        endif()
+
+        message(STATUS "${GPU} supports Vulkan API version '${VULKAN_DRIVER_VERSION}'.")
+
+        # Compare driver and header version
+        if("${VULKAN_DRIVER_VERSION}" VERSION_LESS "${VULKAN_HEADER_VERSION}")
+            # Version mismatch. Let us check if the minor version is the same.
+            if("${VULKAN_DRIVER_VERSION}" MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
+                set(VULKAN_DRIVER_MINOR_VERSION ${CMAKE_MATCH_1})
+            else()
+                message(FATAL_ERROR "Invalid Vulkan driver version '${VULKAN_DRIVER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
+            endif()
+            if("${VULKAN_HEADER_VERSION}" MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
+                set(VULKAN_HEADER_MINOR_VERSION ${CMAKE_MATCH_1})
+            else()
+                message(FATAL_ERROR "Invalid Vulkan Header version '${VULKAN_HEADER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
+            endif()
+
+            if("${VULKAN_DRIVER_MINOR_VERSION}" EQUAL "${VULKAN_HEADER_MINOR_VERSION}")
+                message(WARNING "Your GPU driver does not support Vulkan > ${VULKAN_DRIVER_VERSION}, but you try to use Vulkan Header ${VULKAN_HEADER_VERSION}. At least your driver supports the same minor version (${VULKAN_DRIVER_MINOR_VERSION}), so this should be fine but keep it in mind in case you encounter any strange behavior.")
+                set(VALID_GPU ${GPU})
+                set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
+                break()
+            else()
+                message(STATUS "${GPU} does not support Vulkan > ${VULKAN_DRIVER_VERSION}.")
+            endif()
+        else()
+            set(VALID_GPU ${GPU})
+            set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
+            break()
+        endif()
+    endforeach()
+
+    if("${VALID_GPU}" STREQUAL "")
+        message(FATAL_ERROR "None of your GPUs supports Vulkan Header ${VULKAN_HEADER_VERSION}. Please try updating your driver, or downgrade your Vulkan headers. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
+    else()
+        message(STATUS "Valid GPU (${VALID_GPU}) for Vulkan header version ${VULKAN_HEADER_VERSION} found. ${VALID_GPU} supports up to Vulkan ${VALID_VULKAN_VERSION}.")
+    endif()
+
+endfunction()
diff --git a/kompute/cmake/code_coverage.cmake b/kompute/cmake/code_coverage.cmake
new file mode 100644
index 000000000..7fb6ce264
--- /dev/null
+++ b/kompute/cmake/code_coverage.cmake
@@ -0,0 +1,35 @@
+# Code coverage: selects a custom COVERAGE build type and defines the flag variables CMake uses for it.
+set(CMAKE_BUILD_TYPE COVERAGE CACHE INTERNAL "Coverage build enabled")
+message(STATUS "Enabling gcov support")
+
+if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")  # this file leaves COVERAGE_FLAG unset for Clang
+    set(COVERAGE_FLAG "--coverage")
+endif()
+
+set(CMAKE_CXX_FLAGS_COVERAGE
+    "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
+    CACHE STRING "Flags used by the C++ compiler during coverage builds."
+    FORCE)  # FORCE overwrites any value already stored in the cache
+set(CMAKE_C_FLAGS_COVERAGE
+    "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
+    CACHE STRING "Flags used by the C compiler during coverage builds."
+    FORCE)
+set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
+    ""
+    CACHE STRING "Flags used for linking binaries during coverage builds."
+    FORCE)  # no extra linker flags are added for the COVERAGE configuration
+set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
+    ""
+    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
+    FORCE)
+
+set(CODECOV_DIR ${CMAKE_CURRENT_BINARY_DIR}/codecov/)  # trailing '/' lets the sub-paths below be appended directly
+set(CODECOV_DIR_LCOV ${CODECOV_DIR}lcov/)
+set(CODECOV_FILENAME_LCOV_INFO lcov.info)
+set(CODECOV_FILENAME_LCOV_INFO_FULL lcov_full.info)
+set(CODECOV_DIR_HTML ${CODECOV_DIR}html/)
+
+mark_as_advanced(CMAKE_CXX_FLAGS_COVERAGE
+    CMAKE_C_FLAGS_COVERAGE
+    CMAKE_EXE_LINKER_FLAGS_COVERAGE
+    CMAKE_SHARED_LINKER_FLAGS_COVERAGE)
diff --git a/kompute/cmake/deprecation_warnings.cmake b/kompute/cmake/deprecation_warnings.cmake
new file mode 100644
index 000000000..1ed1f4555
--- /dev/null
+++ b/kompute/cmake/deprecation_warnings.cmake
@@ -0,0 +1,15 @@
+if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)  # each check in this file aborts configuration when a retired option is still set
+    message(FATAL_ERROR "'KOMPUTE_OPT_REPO_SUBMODULE_BUILD' got replaced by 'KOMPUTE_OPT_USE_BUILT_IN_SPDLOG', 'KOMPUTE_OPT_USE_BUILT_IN_FMT', 'KOMPUTE_OPT_USE_BUILT_IN_GOOGLE_TEST', 'KOMPUTE_OPT_USE_BUILT_IN_PYBIND11' and 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER'. Please use them instead.")
+endif()
+
+if(KOMPUTE_OPT_BUILD_AS_SHARED_LIB)  # superseded by the standard BUILD_SHARED_LIBS switch (see message)
+    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_AS_SHARED_LIB' is deprecated and should not be used. Instead use the default 'BUILD_SHARED_LIBS' CMake switch.")
+endif()
+
+if(KOMPUTE_OPT_BUILD_SINGLE_HEADER)  # no replacement option is named; the message says the header is always built
+    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_SINGLE_HEADER' is deprecated and should not be used. The single header will now always be build and can be included via '#include<kompute/kompute.h>'.")
+endif()
+
+if(KOMPUTE_OPT_ENABLE_SPDLOG)  # superseded by KOMPUTE_OPT_LOG_LEVEL (see message)
+    message(FATAL_ERROR "'KOMPUTE_OPT_ENABLE_SPDLOG' is deprecated and should not be used. It got replaced by 'KOMPUTE_OPT_LOG_LEVEL'. This option can be set to a variety of log levels (e.g. 'Off', 'Trace', 'Debug', 'Default', ...).")
+endif()
\ No newline at end of file
diff --git a/kompute/cmake/komputeConfig.cmake.in b/kompute/cmake/komputeConfig.cmake.in
new file mode 100644
index 000000000..87e8a99e2
--- /dev/null
+++ b/kompute/cmake/komputeConfig.cmake.in
@@ -0,0 +1,8 @@
+include(CMakeFindDependencyMacro)
+@PACKAGE_INIT@
+# The find module is FindVulkan.cmake, so the package name must be spelled 'Vulkan' (case matters on case-sensitive file systems).
+find_dependency(Vulkan REQUIRED)
+
+include(${CMAKE_CURRENT_LIST_DIR}/komputeTargets.cmake)
+
+check_required_components(kompute)
\ No newline at end of file
diff --git a/kompute/cmake/vulkan_shader_compiler.cmake b/kompute/cmake/vulkan_shader_compiler.cmake
new file mode 100644
index 000000000..acc27b57c
--- /dev/null
+++ b/kompute/cmake/vulkan_shader_compiler.cmake
@@ -0,0 +1,43 @@
+function(vulkan_compile_shader)
+     # Adds build rules INFILE (.comp, in the current source dir) -> .spv -> header OUTFILE (in the current binary dir).
+     find_program(GLS_LANG_VALIDATOR_PATH NAMES glslangValidator)
+     if(NOT GLS_LANG_VALIDATOR_PATH)
+          message(FATAL_ERROR "glslangValidator not found.")
+     endif()
+
+     cmake_parse_arguments(SHADER_COMPILE "" "INFILE;OUTFILE;NAMESPACE;RELATIVE_PATH" "" ${ARGN})
+     set(SHADER_COMPILE_INFILE_FULL "${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_COMPILE_INFILE}")
+     set(SHADER_COMPILE_SPV_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_INFILE}.spv")
+     set(SHADER_COMPILE_HEADER_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_OUTFILE}")
+
+     if(NOT SHADER_COMPILE_RELATIVE_PATH)
+          set(SHADER_COMPILE_RELATIVE_PATH "${CMAKE_CURRENT_FUNCTION_LIST_DIR}")
+     endif()
+     # RELATIVE_PATH is the directory holding bin_file_to_header.cmake; it defaults to this file's own directory.
+     # .comp -> .spv
+     add_custom_command(OUTPUT "${SHADER_COMPILE_SPV_FILE_FULL}"
+                        COMMAND "${GLS_LANG_VALIDATOR_PATH}"
+                        ARGS "-V"
+                             "${SHADER_COMPILE_INFILE_FULL}"
+                             "-o"
+                             "${SHADER_COMPILE_SPV_FILE_FULL}"
+                        COMMENT "Compile vulkan compute shader from file '${SHADER_COMPILE_INFILE_FULL}' to '${SHADER_COMPILE_SPV_FILE_FULL}'."
+                        MAIN_DEPENDENCY "${SHADER_COMPILE_INFILE_FULL}")
+
+     # Check if big or little endian
+     include (TestBigEndian)
+     TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
+
+     # .spv -> .hpp
+     add_custom_command(OUTPUT "${SHADER_COMPILE_HEADER_FILE_FULL}"
+                        COMMAND ${CMAKE_COMMAND}
+                        ARGS "-DINPUT_SHADER_FILE=${SHADER_COMPILE_SPV_FILE_FULL}"
+                             "-DOUTPUT_HEADER_FILE=${SHADER_COMPILE_HEADER_FILE_FULL}"
+                             "-DHEADER_NAMESPACE=${SHADER_COMPILE_NAMESPACE}"
+                             "-DIS_BIG_ENDIAN=${IS_BIG_ENDIAN}"
+                             "-P"
+                             "${SHADER_COMPILE_RELATIVE_PATH}/bin_file_to_header.cmake"
+                        WORKING_DIRECTORY "${SHADER_COMPILE_RELATIVE_PATH}"
+                        COMMENT "Converting compiled shader '${SHADER_COMPILE_SPV_FILE_FULL}' to header file '${SHADER_COMPILE_HEADER_FILE_FULL}'."
+                        MAIN_DEPENDENCY "${SHADER_COMPILE_SPV_FILE_FULL}")
+endfunction()
diff --git a/kompute/config/FindSphinx.cmake b/kompute/config/FindSphinx.cmake
new file mode 100644
index 000000000..c645ccc9f
--- /dev/null
+++ b/kompute/config/FindSphinx.cmake
@@ -0,0 +1,16 @@
+# Look for an executable called sphinx-build
+find_program(SPHINX_EXECUTABLE
+    NAMES sphinx-build
+    DOC "Path to sphinx-build executable")
+
+# Do not abort here when sphinx-build is missing: find_package_handle_standard_args()
+# below reports the failure and is what honours REQUIRED/QUIET, so a plain
+# find_package(Sphinx) can continue with Sphinx_FOUND set to false.
+
+include(FindPackageHandleStandardArgs)
+
+# Handle standard arguments to find_package like REQUIRED and QUIET
+find_package_handle_standard_args(
+    Sphinx
+    "Failed to find sphinx-build executable"
+    SPHINX_EXECUTABLE)
diff --git a/kompute/external/bin/xxd.c b/kompute/external/bin/xxd.c
new file mode 100644
index 000000000..60ed3f712
--- /dev/null
+++ b/kompute/external/bin/xxd.c
@@ -0,0 +1,819 @@
+/*
+As indicated at https://lists.debian.org/debian-legal/2015/01/msg00037.html,
+the author has permitted redistribution of xxd under the MIT license, as follows:
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * xxd: my hexdump facility. jw
+ *
+ *  2.10.90 changed to word output
+ *  3.03.93 new indent style, dumb bug inserted and fixed.
+ *	    -c option, mls
+ * 26.04.94 better option parser, -ps, -l, -s added.
+ *  1.07.94 -r badly needs - as input file.  Per default autoskip over
+ *	       consecutive lines of zeroes, as unix od does.
+ *	    -a shows them too.
+ *	    -i dump as c-style #include "file.h"
+ *  1.11.95 if "xxd -i" knows the filename, an 'unsigned char filename_bits[]'
+ *	    array is written in correct c-syntax.
+ *	    -s improved, now defaults to absolute seek, relative requires a '+'.
+ *	    -r improved, now -r -s -0x... is supported.
+ *	       change/suppress leading '\0' bytes.
+ *	    -l n improved: stops exactly after n bytes.
+ *	    -r improved, better handling of partial lines with trailing garbage.
+ *	    -r improved, now -r -p works again!
+ *	    -r improved, less flushing, much faster now! (that was silly)
+ *  3.04.96 Per repeated request of a single person: autoskip defaults to off.
+ * 15.05.96 -v added. They want to know the version.
+ *	    -a fixed, to show last line inf file ends in all zeros.
+ *	    -u added: Print upper case hex-letters, as preferred by unix bc.
+ *	    -h added to usage message. Usage message extended.
+ *	    Now using outfile if specified even in normal mode, aehem.
+ *	    No longer mixing of ints and longs. May help doze people.
+ *	    Added binify ioctl for same reason. (Enough Doze stress for 1996!)
+ * 16.05.96 -p improved, removed occasional superfluous linefeed.
+ * 20.05.96 -l 0 fixed. tried to read anyway.
+ * 21.05.96 -i fixed. now honours -u, and prepends __ to numeric filenames.
+ *	    compile -DWIN32 for NT or W95. George V. Reilly, * -v improved :-)
+ *	    support --gnuish-longhorn-options
+ * 25.05.96 MAC support added: CodeWarrior already uses ``outline'' in Types.h
+ *	    which is included by MacHeaders (Axel Kielhorn). Renamed to
+ *	    xxdline().
+ *  7.06.96 -i printed 'int' instead of 'char'. *blush*
+ *	    added Bram's OS2 ifdefs...
+ * 18.07.96 gcc -Wall @ SunOS4 is now slient.
+ *	    Added osver for MSDOS/DJGPP/WIN32.
+ * 29.08.96 Added size_t to strncmp() for Amiga.
+ * 24.03.97 Windows NT support (Phil Hanna). Clean exit for Amiga WB (Bram)
+ * 02.04.97 Added -E option, to have EBCDIC translation instead of ASCII
+ *	    (azc10@yahoo.com)
+ * 22.05.97 added -g (group octets) option (jcook@namerica.kla.com).
+ * 23.09.98 nasty -p -r misfeature fixed: slightly wrong output, when -c was
+ *	    missing or wrong.
+ * 26.09.98 Fixed: 'xxd -i infile outfile' did not truncate outfile.
+ * 27.10.98 Fixed: -g option parser required blank.
+ *	    option -b added: 01000101 binary output in normal format.
+ * 16.05.00 Added VAXC changes by Stephen P. Wall
+ * 16.05.00 Improved MMS file and merge for VMS by Zoltan Arpadffy
+ *
+ * (c) 1990-1998 by Juergen Weigert (jnweiger@informatik.uni-erlangen.de)
+ *
+ * Small changes made afterwards by Bram Moolenaar et al.
+ *
+ * Distribute freely and credit me,
+ * make money and share with me,
+ * lose money and don't ask me.
+ *
+ *
+ */
+
+/* Visual Studio 2005 has 'deprecated' many of the standard CRT functions */
+#if _MSC_VER >= 1400
+# define _CRT_SECURE_NO_DEPRECATE
+# define _CRT_NONSTDC_NO_DEPRECATE
+#endif
+
+#include <stdio.h>
+#ifdef VAXC
+# include <file.h>
+#else
+# include <fcntl.h>
+#endif
+#ifdef __TSC__
+# define MSDOS
+#endif
+#if !defined(OS2) && defined(__EMX__)
+# define OS2
+#endif
+#if defined(MSDOS) || defined(WIN32) || defined(OS2) || defined(__BORLANDC__) || defined(CYGWIN)
+# include <io.h>	/* for setmode() */
+#else
+# ifdef UNIX
+#  include <unistd.h>
+# endif
+#endif
+#include <stdlib.h>
+#include <string.h>	/* for strncmp() */
+#include <ctype.h>	/* for isalnum() */
+#if __MWERKS__ && !defined(BEBOX)
+# include <unix.h>	/* for fdopen() on MAC */
+#endif
+
+#if defined(__BORLANDC__) && __BORLANDC__ <= 0x0410 && !defined(fileno)
+/* Missing define and prototype grabbed from the BC 4.0 <stdio.h> */
+# define fileno(f)       ((f)->fd)
+FILE   _FAR *_Cdecl _FARFUNC fdopen(int __handle, char _FAR *__type);
+#endif
+
+
+/*  This corrects the problem of missing prototypes for certain functions
+ *  in some GNU installations (e.g. SunOS 4.1.x).
+ *  Darren Hiebert <darren@hmi.com> (sparc-sun-sunos4.1.3_U1/2.7.2.2)
+ */
+#if defined(__GNUC__) && defined(__STDC__)
+# ifndef __USE_FIXED_PROTOTYPES__
+#  define __USE_FIXED_PROTOTYPES__
+# endif
+#endif
+
+#ifndef __USE_FIXED_PROTOTYPES__
+/*
+ * This is historic and works only if the compiler really has no prototypes:
+ *
+ * Include prototypes for Sun OS 4.x, when using an ANSI compiler.
+ * FILE is defined on OS 4.x, not on 5.x (Solaris).
+ * if __SVR4 is defined (some Solaris versions), don't include this.
+ */
+#if defined(sun) && defined(FILE) && !defined(__SVR4) && defined(__STDC__)
+#  define __P(a) a
+/* excerpt from my sun_stdlib.h */
+extern int fprintf __P((FILE *, char *, ...));
+extern int fputs   __P((char *, FILE *));
+extern int _flsbuf __P((unsigned char, FILE *));
+extern int _filbuf __P((FILE *));
+extern int fflush  __P((FILE *));
+extern int fclose  __P((FILE *));
+extern int fseek   __P((FILE *, long, int));
+extern int rewind  __P((FILE *));
+
+extern void perror __P((char *));
+# endif
+#endif
+
+extern long int strtol();
+extern long int ftell();
+
+char version[] = "xxd V1.10 27oct98 by Juergen Weigert";
+#ifdef WIN32
+char osver[] = " (Win32)";
+#else
+# ifdef DJGPP
+char osver[] = " (dos 32 bit)";
+# else
+#  ifdef MSDOS
+char osver[] = " (dos 16 bit)";
+#  else
+char osver[] = "";
+#  endif
+# endif
+#endif
+
+#if !defined(CYGWIN) && (defined(CYGWIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__))
+# define CYGWIN
+#endif
+#if defined(MSDOS) || defined(WIN32) || defined(OS2)
+# define BIN_READ(yes)  ((yes) ? "rb" : "rt")
+# define BIN_WRITE(yes) ((yes) ? "wb" : "wt")
+# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
+# define BIN_ASSIGN(fp, yes) setmode(fileno(fp), (yes) ? O_BINARY : O_TEXT)
+# define PATH_SEP '\\'
+#elif defined(CYGWIN)
+# define BIN_READ(yes)  ((yes) ? "rb" : "rt")
+# define BIN_WRITE(yes) ((yes) ? "wb" : "w")
+# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
+# define BIN_ASSIGN(fp, yes) ((yes) ? (void) setmode(fileno(fp), O_BINARY) : (void) (fp))
+# define PATH_SEP '/'
+#else
+# ifdef VMS
+#  define BIN_READ(dummy)  "r"
+#  define BIN_WRITE(dummy) "w"
+#  define BIN_CREAT(dummy) O_CREAT
+#  define BIN_ASSIGN(fp, dummy) fp
+#  define PATH_SEP ']'
+#  define FILE_SEP '.'
+# else
+#  define BIN_READ(dummy)  "r"
+#  define BIN_WRITE(dummy) "w"
+#  define BIN_CREAT(dummy) O_CREAT
+#  define BIN_ASSIGN(fp, dummy) fp
+#  define PATH_SEP '/'
+# endif
+#endif
+
+/* open has only two arguments on the Mac */
+#if __MWERKS__
+# define OPEN(name, mode, umask) open(name, mode)
+#else
+# define OPEN(name, mode, umask) open(name, mode, umask)
+#endif
+
+#ifdef AMIGA
+# define STRNCMP(s1, s2, l) strncmp(s1, s2, (size_t)l)
+#else
+# define STRNCMP(s1, s2, l) strncmp(s1, s2, l)
+#endif
+
+#ifndef __P
+# if defined(__STDC__) || defined(MSDOS) || defined(WIN32) || defined(OS2) \
+        || defined(__BORLANDC__)
+#  define __P(a) a
+# else
+#  define __P(a) ()
+# endif
+#endif
+
+/* Let's collect some prototypes */
+/* CodeWarrior is really picky about missing prototypes */
+static void exit_with_usage __P((char *));
+static int huntype __P((FILE *, FILE *, FILE *, char *, int, int, long));
+static void xxdline __P((FILE *, char *, int));
+
+#define TRY_SEEK	/* attempt to use lseek, or skip forward by reading */
+#define COLS 256	/* change here, if you ever need more columns */
+#define LLEN (11 + (9*COLS-1)/1 + COLS + 2)
+
+char hexxa[] = "0123456789abcdef0123456789ABCDEF", *hexx = hexxa;
+
+/* the different hextypes known by this program: */
+#define HEX_NORMAL 0
+#define HEX_POSTSCRIPT 1
+#define HEX_CINCLUDE 2
+#define HEX_BITS 3		/* not hex a dump, but bits: 01111001 */
+
+/* Print the usage summary to stderr, with pname as the program name, then exit with status 1. */
+static void
+exit_with_usage(char *pname)
+{
+  fprintf(stderr, "Usage:\n       %s [options] [infile [outfile]]\n", pname);
+  fprintf(stderr, "    or\n       %s -r [-s [-]offset] [-c cols] [-ps] [infile [outfile]]\n", pname);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "    -a          toggle autoskip: A single '*' replaces nul-lines. Default off.\n");
+  fprintf(stderr, "    -b          binary digit dump (incompatible with -p,-i,-r). Default hex.\n");
+  fprintf(stderr, "    -c cols     format <cols> octets per line. Default 16 (-i: 12, -ps: 30).\n");
+  fprintf(stderr, "    -E          show characters in EBCDIC. Default ASCII.\n");
+  fprintf(stderr, "    -g          number of octets per group in normal output. Default 2.\n");
+  fprintf(stderr, "    -h          print this summary.\n");
+  fprintf(stderr, "    -i          output in C include file style.\n");
+  fprintf(stderr, "    -l len      stop after <len> octets.\n");
+  fprintf(stderr, "    -ps         output in postscript plain hexdump style.\n");
+  fprintf(stderr, "    -r          reverse operation: convert (or patch) hexdump into binary.\n");
+  fprintf(stderr, "    -r -s off   revert with <off> added to file positions found in hexdump.\n");
+  fprintf(stderr, "    -s %sseek  start at <seek> bytes abs. %sinfile offset.\n",
+#ifdef TRY_SEEK
+      "[+][-]", "(or +: rel.) ");
+#else
+      "", "");
+#endif
+  fprintf(stderr, "    -u          use upper case hex letters.\n");
+  fprintf(stderr, "    -v          show version: \"%s%s\".\n", version, osver);
+  exit(1);
+}
+
+/*
+ * Decode a hex dump read from fpi into binary bytes written to fpo.
+ *
+ * Max. cols binary characters are decoded from the input stream per line.
+ * Two adjacent garbage characters after evaluated data delimit valid data.
+ * Everything up to the next newline is discarded.
+ *
+ * Returns 0 after closing fpi and fpo, or 5 (message on fperr, prefixed with
+ * pname) when the output would have to move backwards. base_off is added to
+ * the offsets read from the dump. The name is historic: 'undo type opt h'.
+ */
+static int
+huntype(FILE *fpi, FILE *fpo, FILE *fperr, char *pname, int cols, int hextype, long base_off)
+{
+  int c, ign_garb = 1, n1 = -1, n2 = 0, n3, p = cols;
+  long have_off = 0, want_off = 0;
+
+  rewind(fpi);
+
+  while ((c = getc(fpi)) != EOF)
+    {
+      if (c == '\r')  /* Doze style input file? */
+        continue;
+
+#if 0	/* this doesn't work when there is normal text after the hex codes in
+       the last line that looks like hex */
+      if (c == ' ' || c == '\n' || c == '\t')  /* allow multiple spaces */
+        continue;
+#endif
+
+      n3 = n2;  /* n1 is the current digit, n2 and n3 the two before it (-1: no hex digit) */
+      n2 = n1;
+
+      if (c >= '0' && c <= '9')
+        n1 = c - '0';
+      else if (c >= 'a' && c <= 'f')
+        n1 = c - 'a' + 10;
+      else if (c >= 'A' && c <= 'F')
+        n1 = c - 'A' + 10;
+      else
+        {
+          n1 = -1;
+          if (ign_garb)
+            continue;
+        }
+
+      ign_garb = 0;
+
+      if (p >= cols)
+        {
+          if (!hextype)
+            {
+              if (n1 < 0)
+                {
+                  p = 0;
+                  continue;
+                }
+              want_off = (want_off << 4) | n1;  /* accumulate the offset column digit by digit */
+              continue;
+            }
+          else
+            p = 0;
+        }
+
+      if (base_off + want_off != have_off)
+        {
+          fflush(fpo);
+#ifdef TRY_SEEK
+          c = fseek(fpo, base_off + want_off - have_off, SEEK_CUR);
+          if (c >= 0)
+            have_off = base_off + want_off;
+#endif
+          if (base_off + want_off < have_off)
+            {
+              fprintf(fperr, "%s: sorry, cannot seek backwards.\n", pname);
+              return 5;
+            }
+          for (; have_off < base_off + want_off; have_off++)
+            putc(0, fpo);  /* pad forward with NUL bytes when seeking was not possible */
+        }
+
+      if (n2 >= 0 && n1 >= 0)
+        {
+          putc((n2 << 4) | n1, fpo);
+          have_off++;
+          want_off++;
+          n1 = -1;
+          if ((++p >= cols) && !hextype)
+            {
+              /* skip rest of line as garbage */
+              want_off = 0;
+              while ((c = getc(fpi)) != '\n' && c != EOF)
+                ;
+              ign_garb = 1;
+            }
+        }
+      else if (n1 < 0 && n2 < 0 && n3 < 0)
+        {
+          /* already stumbled into garbage, skip line, wait and see */
+          if (!hextype)
+            want_off = 0;
+          while ((c = getc(fpi)) != '\n' && c != EOF)
+            ;
+          ign_garb = 1;
+        }
+    }
+  fflush(fpo);
+#ifdef TRY_SEEK
+  fseek(fpo, 0L, SEEK_END);
+#endif
+  fclose(fpo);
+  fclose(fpi);
+  return 0;
+}
+
+/*
+ * Print line l. If nz is false, xxdline regards the line a line of
+ * zeroes. If there are three or more consecutive lines of zeroes,
+ * they are replaced by a single '*' character.
+ *
+ * If the output ends with more than two lines of zeroes, you
+ * should call xxdline again with l being the last line and nz
+ * negative. This ensures that the last line is shown even when
+ * it is all zeroes.
+ *
+ * If nz is always positive, lines are never suppressed.
+ */
+static void
+xxdline(FILE *fp, char *l, int nz)
+{
+  static char z[LLEN+1];     /* copy of the second line of the current run of zero lines */
+  static int zero_seen = 0;  /* number of consecutive zero lines counted so far */
+
+  /* Remember the second zero line; it is printed verbatim if the run stops at two. */
+  if (!nz && zero_seen == 1)
+    strcpy(z, l);
+
+  /* Only non-zero lines and the first zero line of a run produce output here. */
+  /* (zero_seen is incremented for every zero line, printed or not.) */
+  if (nz || !zero_seen++)
+    {
+      if (nz)
+        {
+          if (nz < 0)
+            zero_seen--;
+          if (zero_seen == 2)
+            fputs(z, fp);
+          if (zero_seen > 2)
+            fputs("*\n", fp);
+        }
+      if (nz >= 0 || zero_seen > 0)
+        fputs(l, fp);
+      if (nz)
+        zero_seen = 0;
+    }
+}
+
+/* This is an EBCDIC to ASCII conversion table (entries written in octal) */
+/* from a proposed BTL standard April 16, 1979 */
+static unsigned char etoa64[] =  /* 192 entries; presumably indexed by (EBCDIC code - 64) -- confirm at the use site */
+{
+    0040,0240,0241,0242,0243,0244,0245,0246,
+    0247,0250,0325,0056,0074,0050,0053,0174,
+    0046,0251,0252,0253,0254,0255,0256,0257,
+    0260,0261,0041,0044,0052,0051,0073,0176,
+    0055,0057,0262,0263,0264,0265,0266,0267,
+    0270,0271,0313,0054,0045,0137,0076,0077,
+    0272,0273,0274,0275,0276,0277,0300,0301,
+    0302,0140,0072,0043,0100,0047,0075,0042,
+    0303,0141,0142,0143,0144,0145,0146,0147,
+    0150,0151,0304,0305,0306,0307,0310,0311,
+    0312,0152,0153,0154,0155,0156,0157,0160,
+    0161,0162,0136,0314,0315,0316,0317,0320,
+    0321,0345,0163,0164,0165,0166,0167,0170,
+    0171,0172,0322,0323,0324,0133,0326,0327,
+    0330,0331,0332,0333,0334,0335,0336,0337,
+    0340,0341,0342,0343,0344,0135,0346,0347,
+    0173,0101,0102,0103,0104,0105,0106,0107,
+    0110,0111,0350,0351,0352,0353,0354,0355,
+    0175,0112,0113,0114,0115,0116,0117,0120,
+    0121,0122,0356,0357,0360,0361,0362,0363,
+    0134,0237,0123,0124,0125,0126,0127,0130,
+    0131,0132,0364,0365,0366,0367,0370,0371,
+    0060,0061,0062,0063,0064,0065,0066,0067,
+    0070,0071,0372,0373,0374,0375,0376,0377
+};
+
+const char* extract_filename(const char* path) {
+    /* Basename: the text after the final '/', or all of path when it has no '/'. */
+    const char *base = path, *cursor;
+    for (cursor = path; *cursor != '\0'; cursor++)
+        if (*cursor == '/') base = cursor + 1;
+    return base;
+}
+
+/* Entry point: parse the command line, open input and output, then revert, emit a
+ * C include, or print a hex/bit dump.  The successful dump paths return 0. */
+int
+main(int argc, char *argv[])
+{
+  FILE *fp, *fpo;
+  int c, e, p = 0, relseek = 1, negseek = 0, revert = 0;
+  int cols = 0, nonzero = 0, autoskip = 0, hextype = HEX_NORMAL;
+  int ebcdic = 0;
+  int octspergrp = -1;	/* number of octets grouped in output */
+  int grplen;		/* total chars per octet group */
+  long length = -1, n = 0, seekoff = 0;
+  char l[LLEN+1];
+  char *pname, *pp;
+
+#ifdef AMIGA
+  /* This program doesn't work when started from the Workbench */
+  if (argc == 0)
+    exit(1);
+#endif
+  /* Program name for messages: text after the last PATH_SEP, cut at FILE_SEP if defined. */
+  pname = argv[0];
+  for (pp = pname; *pp; )
+    if (*pp++ == PATH_SEP)
+      pname = pp;
+#ifdef FILE_SEP
+  for (pp = pname; *pp; pp++)
+    if (*pp == FILE_SEP)
+      {
+    *pp = '\0';
+    break;
+      }
+#endif
+  /* Option parsing: the first '-' of a "--xyz" option is skipped, so "--cols" matches like "-cols". */
+  while (argc >= 2)
+    {
+      pp = argv[1] + (!STRNCMP(argv[1], "--", 2) && argv[1][2]);
+      if (!STRNCMP(pp, "-a", 2)) autoskip = 1 - autoskip;
+      else if (!STRNCMP(pp, "-b", 2)) hextype = HEX_BITS;
+      else if (!STRNCMP(pp, "-u", 2)) hexx = hexxa + 16;
+      else if (!STRNCMP(pp, "-p", 2)) hextype = HEX_POSTSCRIPT;
+      else if (!STRNCMP(pp, "-i", 2)) hextype = HEX_CINCLUDE;
+      else if (!STRNCMP(pp, "-r", 2)) revert++;
+      else if (!STRNCMP(pp, "-E", 2)) ebcdic++;
+      else if (!STRNCMP(pp, "-v", 2))
+    {
+      fprintf(stderr, "%s%s\n", version, osver);
+      exit(0);
+    }
+      else if (!STRNCMP(pp, "-c", 2))
+    {
+      if (pp[2] && STRNCMP("ols", pp + 2, 3))
+        cols = (int)strtol(pp + 2, NULL, 0);
+      else
+        {
+          if (!argv[2])
+        exit_with_usage(pname);
+          cols = (int)strtol(argv[2], NULL, 0);
+          argv++;
+          argc--;
+        }
+    }
+      else if (!STRNCMP(pp, "-g", 2))
+    {
+      if (pp[2] && STRNCMP("roupsize", pp + 2, 8))
+        octspergrp = (int)strtol(pp + 2, NULL, 0);
+      else
+        {
+          if (!argv[2])
+        exit_with_usage(pname);
+          octspergrp = (int)strtol(argv[2], NULL, 0);
+          argv++;
+          argc--;
+        }
+    }
+      else if (!STRNCMP(pp, "-s", 2))
+    {
+      relseek = 0;
+      negseek = 0;
+      if (pp[2] && STRNCMP("kip", pp+2, 3) && STRNCMP("eek", pp+2, 3))
+        {
+#ifdef TRY_SEEK
+          if (pp[2] == '+')
+        relseek++;
+          if (pp[2+relseek] == '-')
+        negseek++;
+#endif
+          seekoff = strtol(pp + 2+relseek+negseek, (char **)NULL, 0);
+        }
+      else
+        {
+          if (!argv[2])
+        exit_with_usage(pname);
+#ifdef TRY_SEEK
+          if (argv[2][0] == '+')
+        relseek++;
+          if (argv[2][relseek] == '-')
+        negseek++;
+#endif
+          seekoff = strtol(argv[2] + relseek+negseek, (char **)NULL, 0);
+          argv++;
+          argc--;
+        }
+    }
+      else if (!STRNCMP(pp, "-l", 2))
+    {
+      if (pp[2] && STRNCMP("en", pp + 2, 2))
+        length = strtol(pp + 2, (char **)NULL, 0);
+      else
+        {
+          if (!argv[2])
+        exit_with_usage(pname);
+          length = strtol(argv[2], (char **)NULL, 0);
+          argv++;
+          argc--;
+        }
+    }
+      else if (!strcmp(pp, "--"))	/* end of options */
+    {
+      argv++;
+      argc--;
+      break;
+    }
+      else if (pp[0] == '-' && pp[1])	/* unknown option */
+    exit_with_usage(pname);
+      else
+    break;				/* not an option */
+
+      argv++;				/* advance to next argument */
+      argc--;
+    }
+  /* Fill in per-format defaults for the options the user did not set. */
+  if (!cols)
+    switch (hextype)
+      {
+      case HEX_POSTSCRIPT:	cols = 30; break;
+      case HEX_CINCLUDE:	cols = 12; break;
+      case HEX_BITS:		cols = 6; break;
+      case HEX_NORMAL:
+      default:			cols = 16; break;
+      }
+
+  if (octspergrp < 0)
+    switch (hextype)
+      {
+      case HEX_BITS:		octspergrp = 1; break;
+      case HEX_NORMAL:		octspergrp = 2; break;
+      case HEX_POSTSCRIPT:
+      case HEX_CINCLUDE:
+      default:			octspergrp = 0; break;
+      }
+
+  if (cols < 1 || ((hextype == HEX_NORMAL || hextype == HEX_BITS)
+                                && (cols > COLS)))
+    {
+      fprintf(stderr, "%s: invalid number of columns (max. %d).\n", pname, COLS);
+      exit(1);
+    }
+
+  if (octspergrp < 1)
+    octspergrp = cols;
+
+  if (argc > 3)
+    exit_with_usage(pname);
+  /* Input: stdin when no file name (or "-") remains, otherwise the named file. */
+  if (argc == 1 || (argv[1][0] == '-' && !argv[1][1]))
+    BIN_ASSIGN(fp = stdin, !revert);
+  else
+    {
+      if ((fp = fopen(argv[1], BIN_READ(!revert))) == NULL)
+    {
+      fprintf(stderr,"%s: ", pname);
+      perror(argv[1]);
+      return 2;
+    }
+    }
+  /* Output: stdout unless a second file name other than "-" is given. */
+  if (argc < 3 || (argv[2][0] == '-' && !argv[2][1]))
+    BIN_ASSIGN(fpo = stdout, revert);
+  else
+    {
+      int fd;
+      int mode = revert ? O_WRONLY : (O_TRUNC|O_WRONLY);
+
+      if (((fd = OPEN(argv[2], mode | BIN_CREAT(revert), 0666)) < 0) ||
+      (fpo = fdopen(fd, BIN_WRITE(revert))) == NULL)
+    {
+      fprintf(stderr, "%s: ", pname);
+      perror(argv[2]);
+      return 3;
+    }
+      rewind(fpo);
+    }
+  /* Revert mode hands the whole job to huntype() and returns its result. */
+  if (revert)
+    {
+      if (hextype && (hextype != HEX_POSTSCRIPT))
+    {
+      fprintf(stderr, "%s: sorry, cannot revert this type of hexdump\n", pname);
+      return -1;
+    }
+      return huntype(fp, fpo, stderr, pname, cols, hextype,
+        negseek ? -seekoff : seekoff);
+    }
+  /* Move to the start offset: fseek() under TRY_SEEK, otherwise (or on failure) read and discard. */
+  if (seekoff || negseek || !relseek)
+    {
+#ifdef TRY_SEEK
+      if (relseek)
+    e = fseek(fp, negseek ? -seekoff : seekoff, 1);
+      else
+    e = fseek(fp, negseek ? -seekoff : seekoff, negseek ? 2 : 0);
+      if (e < 0 && negseek)
+    {
+      fprintf(stderr, "%s: sorry cannot seek.\n", pname);
+      return 4;
+    }
+      if (e >= 0)
+    seekoff = ftell(fp);
+      else
+#endif
+    {
+      long s = seekoff;
+
+      while (s--)
+        (void)getc(fp);
+    }
+    }
+  /* C include output (-i): an array definition plus a _len variable named after the input file. */
+  if (hextype == HEX_CINCLUDE)
+    {
+      const char* filename = (fp != stdin) ? extract_filename(argv[1]) : NULL;
+      /* argv[1] is NULL when stdin is used without a file argument; the name is only needed for files. */
+      if (fp != stdin)
+    {
+      fprintf(fpo, "unsigned char %s", isdigit((int)filename[0]) ? "__" : "");
+      for (e = 0; (c = filename[e]) != 0; e++)
+        putc(isalnum(c) ? c : '_', fpo);
+      fputs("[] = {\n", fpo);
+    }
+
+      p = 0;
+      while ((length < 0 || p < length) && (c = getc(fp)) != EOF)
+    {
+      fprintf(fpo, (hexx == hexxa) ? "%s0x%02x" : "%s0X%02X",
+        (p % cols) ? ", " : ",\n  "+2*!p,  c);
+      p++;
+    }
+
+      if (p)
+    fputs("\n};\n"+3*(fp == stdin), fpo);
+
+      if (fp != stdin)
+    {
+      fprintf(fpo, "unsigned int %s", isdigit((int)filename[0]) ? "__" : "");
+      for (e = 0; (c = filename[e]) != 0; e++)
+        putc(isalnum(c) ? c : '_', fpo);
+      fprintf(fpo, "_len = %d;\n", p);
+    }
+
+      fclose(fp);
+      fclose(fpo);
+      return 0;
+    }
+  /* Plain hex output (-p): hex digits only, cols octets per line, written to fpo (not stdout). */
+  if (hextype == HEX_POSTSCRIPT)
+    {
+      p = cols;
+      while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
+    {
+      putc(hexx[(e >> 4) & 0xf], fpo);
+      putc(hexx[(e     ) & 0xf], fpo);
+      n++;
+      if (!--p)
+        {
+          putc('\n', fpo);
+          p = cols;
+        }
+    }
+      if (p < cols)
+    putc('\n', fpo);
+      fclose(fp);
+      fclose(fpo);
+      return 0;
+    }
+
+  /* hextype: HEX_NORMAL or HEX_BITS */
+
+  if (hextype == HEX_NORMAL)
+    grplen = octspergrp + octspergrp + 1;	/* chars per octet group */
+  else	/* hextype == HEX_BITS */
+    grplen = 8 * octspergrp + 1;
+
+  while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
+    {
+      if (p == 0)
+    {
+      sprintf(l, "%07lx: ", n + seekoff);
+      for (c = 9; c < LLEN; l[c++] = ' ');
+    }
+      if (hextype == HEX_NORMAL)
+    {
+      l[c = (9 + (grplen * p) / octspergrp)] = hexx[(e >> 4) & 0xf];
+      l[++c]			       = hexx[ e       & 0xf];
+    }
+      else /* hextype == HEX_BITS */
+    {
+      int i;
+
+      c = (9 + (grplen * p) / octspergrp) - 1;
+      for (i = 7; i >= 0; i--)
+        l[++c] = (e & (1 << i)) ? '1' : '0';
+    }
+      if (ebcdic)
+    e = (e < 64) ? '.' : etoa64[e-64];
+      /* When changing this update definition of LLEN above. */
+      l[11 + (grplen * cols - 1)/octspergrp + p] =
+#ifdef __MVS__
+      (e >= 64)
+#else
+      (e > 31 && e < 127)
+#endif
+      ? e : '.';
+      if (e)
+    nonzero++;
+      n++;
+      if (++p == cols)
+    {
+      l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
+      xxdline(fpo, l, autoskip ? nonzero : 1);
+      nonzero = 0;
+      p = 0;
+    }
+    }
+  if (p)
+    {
+      l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
+      xxdline(fpo, l, 1);
+    }
+  else if (autoskip)
+    xxdline(fpo, l, -1);	/* last chance to flush out suppressed lines */
+
+  fclose(fp);
+  fclose(fpo);
+  return 0;
+}
diff --git a/kompute/kompute-config.cmake b/kompute/kompute-config.cmake
new file mode 100644
index 000000000..10425252c
--- /dev/null
+++ b/kompute/kompute-config.cmake
@@ -0,0 +1,28 @@
+# General purpose GPU compute framework built on Vulkan to
+# support 1000s of cross vendor graphics cards
+# (AMD, Qualcomm, NVIDIA & friends). Blazing fast, mobile-enabled,
+# asynchronous and optimized for advanced GPU data processing use cases.
+# Backed by the Linux Foundation.
+#
+# Finding this module will define the following variables:
+#  KOMPUTE_FOUND - True if the core library has been found
+#  KOMPUTE_LIBRARIES - Path to the core library archive
+#  KOMPUTE_INCLUDE_DIRS - Path to the include directory. It provides
+#                     kompute.h, the single header that must be included in every
+#                     file that uses this interface, and it is also the
+#                     directory for the individual includes.
+
+find_path(KOMPUTE_INCLUDE_DIR  # directory in which kompute.h is found
+          NAMES kompute.h)
+# Locate the kompute library; KOMPUTE_LIBRARY_ROOT, when set by the caller, is used as a search hint.
+find_library(KOMPUTE_LIBRARY
+             NAMES kompute
+             HINTS ${KOMPUTE_LIBRARY_ROOT})
+# KOMPUTE_FOUND is set only when both the library and the include directory were located.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(KOMPUTE REQUIRED_VARS KOMPUTE_LIBRARY KOMPUTE_INCLUDE_DIR)
+# Publish the plural result variables documented in the header comment.
+if(KOMPUTE_FOUND)
+    set(KOMPUTE_LIBRARIES ${KOMPUTE_LIBRARY})
+    set(KOMPUTE_INCLUDE_DIRS ${KOMPUTE_INCLUDE_DIR})
+endif()
diff --git a/kompute/op_add.comp b/kompute/op_add.comp
new file mode 100644
index 000000000..7e4e43d75
--- /dev/null
+++ b/kompute/op_add.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {             // named struct: `typedef` is a reserved word in GLSL and would not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() {
+    // One single-invocation workgroup per output element; pcs.row is not used by this shader.
+    const uint idx = gl_WorkGroupID.x;
+    out_[pcs.outOff + idx] = inA[pcs.inAOff + idx] + inB[pcs.inBOff + idx];
+}
\ No newline at end of file
diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp
new file mode 100644
index 000000000..492f672e5
--- /dev/null
+++ b/kompute/op_addrow.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {             // named struct: `typedef` is a reserved word in GLSL and would not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() {
+    // One single-invocation workgroup per output element; inB is indexed modulo pcs.row, so it repeats every pcs.row elements.
+    const uint idx = gl_WorkGroupID.x;
+    out_[pcs.outOff + idx] = inA[pcs.inAOff + idx] + inB[pcs.inBOff + (idx % pcs.row)];
+}
\ No newline at end of file
diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp
new file mode 100644
index 000000000..40d756ae5
--- /dev/null
+++ b/kompute/op_cpy_f16_f16.comp
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {             // named struct: `typedef` is a reserved word in GLSL and would not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+#define IN_TYPE float16_t
+#define IN_TYPE_SIZE 2
+#define OUT_TYPE float16_t
+#define OUT_TYPE_SIZE 2
+
+layout(local_size_x = nth) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    // One workgroup per source row: the workgroup id supplies the row's source indices 1..3.
+    const uint i03 = gl_WorkGroupID.z, i02 = gl_WorkGroupID.y, i01 = gl_WorkGroupID.x;
+    // Flat index of the row's first element, re-expressed below as destination indices i0..i3.
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+    const int dstPlane = pcs.ne1*pcs.ne0;
+    const int dstVolume = pcs.ne2*dstPlane;
+    const int i3 = n / dstVolume;
+    const int left3 = n - i3*dstVolume;
+    const int i2 = left3 / dstPlane;
+    const int left2 = left3 - i2*dstPlane;
+    const int i1 = left2 / pcs.ne0;
+    const int i0 = left2 - i1*pcs.ne0;
+    // Index into out_ of the row start; the nb* strides are divided by the element size (presumably byte strides - confirm).
+    const uint dstRow = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff;
+    for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += nth) {  // the nth invocations stride across the row
+        out_[dstRow + col] = OUT_TYPE(in_[uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff]);
+    }
+}
diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp
new file mode 100644
index 000000000..309c48aed
--- /dev/null
+++ b/kompute/op_cpy_f16_f32.comp
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {             // named struct: `typedef` is a reserved word in GLSL and would not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+#define IN_TYPE float16_t
+#define IN_TYPE_SIZE 2
+#define OUT_TYPE float
+#define OUT_TYPE_SIZE 4
+
+layout(local_size_x = nth) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    // One workgroup per source row: the workgroup id supplies the row's source indices 1..3.
+    const uint i03 = gl_WorkGroupID.z, i02 = gl_WorkGroupID.y, i01 = gl_WorkGroupID.x;
+    // Flat index of the row's first element, re-expressed below as destination indices i0..i3.
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+    const int dstPlane = pcs.ne1*pcs.ne0;
+    const int dstVolume = pcs.ne2*dstPlane;
+    const int i3 = n / dstVolume;
+    const int left3 = n - i3*dstVolume;
+    const int i2 = left3 / dstPlane;
+    const int left2 = left3 - i2*dstPlane;
+    const int i1 = left2 / pcs.ne0;
+    const int i0 = left2 - i1*pcs.ne0;
+    // Index into out_ of the row start; the nb* strides are divided by the element size (presumably byte strides - confirm).
+    const uint dstRow = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff;
+    for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += nth) {  // the nth invocations stride across the row
+        out_[dstRow + col] = OUT_TYPE(in_[uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff]);
+    }
+}
diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp
new file mode 100644
index 000000000..fb0e00d67
--- /dev/null
+++ b/kompute/op_cpy_f32_f16.comp
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {             // named struct: `typedef` is a reserved word in GLSL and would not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+#define IN_TYPE float
+#define IN_TYPE_SIZE 4
+#define OUT_TYPE float16_t
+#define OUT_TYPE_SIZE 2
+
+layout(local_size_x = nth) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    // One workgroup per source row: the workgroup id supplies the row's source indices 1..3.
+    const uint i03 = gl_WorkGroupID.z, i02 = gl_WorkGroupID.y, i01 = gl_WorkGroupID.x;
+    // Flat index of the row's first element, re-expressed below as destination indices i0..i3.
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+    const int dstPlane = pcs.ne1*pcs.ne0;
+    const int dstVolume = pcs.ne2*dstPlane;
+    const int i3 = n / dstVolume;
+    const int left3 = n - i3*dstVolume;
+    const int i2 = left3 / dstPlane;
+    const int left2 = left3 - i2*dstPlane;
+    const int i1 = left2 / pcs.ne0;
+    const int i0 = left2 - i1*pcs.ne0;
+    // Index into out_ of the row start; the nb* strides are divided by the element size (presumably byte strides - confirm).
+    const uint dstRow = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff;
+    for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += nth) {  // the nth invocations stride across the row
+        out_[dstRow + col] = OUT_TYPE(in_[uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff]);
+    }
+}
diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp
new file mode 100644
index 000000000..f43480b8d
--- /dev/null
+++ b/kompute/op_cpy_f32_f32.comp
@@ -0,0 +1,168 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K { // declared by name: `typedef` is a reserved word in GLSL and does not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+#define IN_TYPE float
+#define IN_TYPE_SIZE 4
+#define OUT_TYPE float
+#define OUT_TYPE_SIZE 4
+
+layout(local_size_x = nth) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff; // added to every computed in_ index
+    uint outOff; // added to every computed out_ index
+    int ne00; // source extent along dim 0; bound of the per-row copy loop in main()
+    int ne01; // source extent along dim 1
+    int ne02; // source extent along dim 2
+    uint nb00; // source strides for dims 0..3 (nb00..nb03); main() divides their
+    uint nb01; // weighted sum by IN_TYPE_SIZE to get an in_ index, so they are
+    uint nb02; // presumably byte strides -- confirm against the host code
+    uint nb03;
+    int ne0; // destination extents along dims 0..2 (ne0..ne2)
+    int ne1;
+    int ne2;
+    uint nb0; // destination strides for dims 0..3 (nb0..nb3); main() divides their
+    uint nb1; // weighted sum by OUT_TYPE_SIZE to get an out_ index
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    // One workgroup per source row: x/y/z of the workgroup ID select i01/i02/i03.
+    const uvec3 row = gl_WorkGroupID;
+    // Flat element index of the row start, counted in the source shape.
+    const int flat = int(row.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(row.y)*pcs.ne01*pcs.ne00 + int(row.x)*pcs.ne00;
+    // Re-express that flat index as coordinates (d3, d2, d1, d0) in the destination shape.
+    const int vol = pcs.ne2*pcs.ne1*pcs.ne0;
+    const int plane = pcs.ne1*pcs.ne0;
+    const int d3 = flat / vol;
+    const int d2 = (flat - d3*vol) / plane;
+    const int d1 = (flat - d3*vol - d2*plane) / pcs.ne0;
+    const int d0 = flat - d3*vol - d2*plane - d1*pcs.ne0;
+    const uint dstBase = (d3*pcs.nb3 + d2*pcs.nb2 + d1*pcs.nb1 + d0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // index into out_
+    // The nth invocations of the workgroup stride across the row together.
+    for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += nth) {
+        const uint srcIdx = uint((row.z*pcs.nb03 + row.y*pcs.nb02 + row.x*pcs.nb01 + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // index into in_
+        out_[dstBase + col] = OUT_TYPE(in_[srcIdx]);
+    }
+}
diff --git a/kompute/op_diagmask.comp b/kompute/op_diagmask.comp
new file mode 100644
index 000000000..18b0192d7
--- /dev/null
+++ b/kompute/op_diagmask.comp
@@ -0,0 +1,153 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K { // declared by name: `typedef` is a reserved word in GLSL and does not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff; // added to the flat element index when reading in_
+    uint outOff; // added to the flat element index when writing out_
+    uint n_past; // mask threshold offset: an element is masked when i00 > n_past + i01
+    int ne00; // extent of dim 0 (x workgroup axis), used to flatten the index
+    int ne01; // extent of dim 1 (y workgroup axis), used to flatten the index
+} pcs;
+
+void main() {
+    // One single-invocation workgroup per element: x -> i00, y -> i01, z -> i02.
+    const uvec3 pos = gl_WorkGroupID;
+    const uint flat = pos.z*pcs.ne01*pcs.ne00 + pos.y*pcs.ne00 + pos.x;
+    const uint dst = flat + pcs.outOff;
+    // Elements with i00 <= n_past + i01 are copied through unchanged.
+    if (pos.x <= pcs.n_past + pos.y) {
+        out_[dst] = in_[flat + pcs.inOff];
+        return;
+    }
+    // Everything else is overwritten with 0xFF800000, the binary32 bit pattern of -infinity.
+    out_[dst] = uintBitsToFloat(0xFF800000);
+}
diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp
new file mode 100644
index 000000000..8079b8ef2
--- /dev/null
+++ b/kompute/op_gelu.comp
@@ -0,0 +1,142 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K { // declared by name: `typedef` is a reserved word in GLSL and does not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(push_constant) uniform PushConstants {
+    uint inOff; // added to the workgroup index when reading in_
+    uint outOff; // added to the workgroup index when writing out_
+} pcs;
+
+void main() { // tanh-approximation GELU of one element per single-invocation workgroup
+    const uint i = gl_WorkGroupID.x;
+    const float x = in_[i + pcs.inOff];
+    // Clamp the tanh argument: an exp-based tanh overflows to inf/inf = NaN for large |x|, and tanh(+-15) already rounds to +-1 in float32.
+    out_[i + pcs.outOff] = 0.5*x*(1.0 + tanh(clamp(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x), -15.0, 15.0)));
+}
diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp
new file mode 100644
index 000000000..e0f5bb16e
--- /dev/null
+++ b/kompute/op_getrows_f16.comp
@@ -0,0 +1,150 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K { // declared by name: `typedef` is a reserved word in GLSL and does not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff; // added to every index into inA (float16_t elements)
+    uint inBOff; // added to every index into inB (row indices)
+    uint outOff; // added to every index into out_ (float elements)
+    int ne00; // number of elements copied per gathered row
+    int nb01; // inA row stride; main() halves it to index float16_t, so presumably bytes
+    int nb1; // out_ row stride, multiplied by the workgroup (output row) index
+} pcs;
+
+void main() { // one single-invocation workgroup per gathered row
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff]; // index of the inA row to gather
+    // nb01 and nb1 are treated as byte strides (as in the q4_0/q4_1 get_rows shaders): /2 indexes float16_t inA, /4 indexes float out_.
+    for (int j = 0; j < pcs.ne00; j++) {
+        out_[i*pcs.nb1/4 + j + pcs.outOff] = inA[r*pcs.nb01/2+j + pcs.inAOff];
+    }
+}
diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp
new file mode 100644
index 000000000..cddba929b
--- /dev/null
+++ b/kompute/op_getrows_q4_0.comp
@@ -0,0 +1,179 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K { // declared by name: `typedef` is a reserved word in GLSL and does not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff; // added to the byte index into inA
+    uint inBOff; // added to the index into inB (row indices)
+    uint outOff; // added to the float index into out_
+    int ne00; // row length in elements; passed to dequantize_row_q4_0 as k
+    int nb01; // inA row stride; multiplied by the row index to form a byte index
+    int nb1; // out_ row stride; main() divides it by 4 to form a float index
+} pcs;
+
+#define UNALIGNED_INPUT inA
+
+block_q4_0 get_unaligned_block_q4_0(uint index) { // assemble one q4_0 block from raw bytes of UNALIGNED_INPUT
+    block_q4_0 blk;
+    [[unroll]] for (uint b = 0; b < QK4_0 / 2; ++b) {
+        blk.qs[b] = UNALIGNED_INPUT[index + 2 + b]; // packed 4-bit values start at byte 2
+    }
+    blk.d = u8BufToFloat16(UNALIGNED_INPUT, index); // fp16 scale from bytes 0..1, low byte first
+    return blk;
+}
+
+void dequantize_row_q4_0(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
+    // Expands k values (whole blocks only) starting at byte x of inA into floats starting at out_[y].
+    const uint blockLen = QK4_0;
+    const uint halfLen = blockLen / 2;
+    const uint blockCount = uint(k) / blockLen;
+    // Walk the row block by block; the source advances in bytes, the destination in elements.
+    for (uint b = 0; b < blockCount; ++b) {
+        const block_q4_0 blk = get_unaligned_block_q4_0(x + b*sizeof_block_q4_0);
+        const float16_t scale = blk.d;
+        const uint outBase = y + b*blockLen;
+        // Byte q holds value q in its low nibble and value q + halfLen in its high nibble, both biased by 8.
+        for (uint q = 0; q < halfLen; ++q) {
+            const int lo = (blk.qs[q] & 0x0F) - 8;
+            const int hi = (blk.qs[q] >>   4) - 8;
+            out_[outBase + q] = float(lo)*scale;
+            out_[outBase + q + halfLen] = float(hi)*scale;
+        }
+    }
+}
+
+void main() { // expand the inA row selected through inB into the output row of this workgroup
+    const uint dstRow = gl_WorkGroupID.x;
+    const int srcRow = inB[dstRow + pcs.inBOff];
+    const uint srcByte = uint(srcRow*pcs.nb01) + pcs.inAOff; // byte index into inA
+    dequantize_row_q4_0(srcByte, uint(dstRow*pcs.nb1/4) + pcs.outOff, pcs.ne00);
+}
diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp
new file mode 100644
index 000000000..151848a9d
--- /dev/null
+++ b/kompute/op_getrows_q4_1.comp
@@ -0,0 +1,181 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K { // declared by name: `typedef` is a reserved word in GLSL and does not compile
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff; // added to the byte index into inA
+    uint inBOff; // added to the index into inB (row indices)
+    uint outOff; // added to the float index into out_
+    int ne00; // row length in elements; passed to dequantize_row_q4_1 as k
+    int nb01; // inA row stride; multiplied by the row index to form a byte index
+    int nb1; // out_ row stride; main() divides it by 4 to form a float index
+} pcs;
+
+#define UNALIGNED_INPUT inA
+
+block_q4_1 get_unaligned_block_q4_1(uint index) { // assemble one q4_1 block from raw bytes of UNALIGNED_INPUT
+    block_q4_1 blk;
+    [[unroll]] for (uint b = 0; b < QK4_1 / 2; ++b) {
+        blk.qs[b] = UNALIGNED_INPUT[index + 4 + b]; // packed 4-bit values start at byte 4
+    }
+    blk.m = u8BufToFloat16(UNALIGNED_INPUT, index + 2); // fp16 m from bytes 2..3
+    blk.d = u8BufToFloat16(UNALIGNED_INPUT, index); // fp16 d from bytes 0..1, low byte first
+    return blk;
+}
+
+void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
+    const uint qk = QK4_1;
+    // Expands k values (whole blocks only) starting at byte x of inA into floats starting at out_[y].
+    const uint nb = k / qk;
+    // Consecutive blocks are sizeof_block_q4_1 (0x14) bytes apart: fp16 d, fp16 m, then QK4_1/2 bytes of nibbles.
+    for (uint i = 0; i < nb; i++) {
+        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
+        // Every value in the block is reconstructed as q*d + m.
+        const float16_t d = block.d;
+        const float16_t m = block.m;
+        // Byte j holds value j in its low nibble and value j + qk/2 in its high nibble.
+        for (uint j = 0; j < qk/2; ++j) {
+            const int x0 = (block.qs[j] & 0x0F);
+            const int x1 = (block.qs[j] >>   4);
+
+            out_[y+i*qk + j + 0   ] = float(x0)*d + m;
+            out_[y+i*qk + j + qk/2] = float(x1)*d + m;
+        }
+    }
+}
+
+void main() { // expand the inA row selected through inB into the output row of this workgroup
+    const uint dstRow = gl_WorkGroupID.x;
+    const int srcRow = inB[dstRow + pcs.inBOff];
+    const uint srcByte = uint(srcRow*pcs.nb01) + pcs.inAOff; // byte index into inA
+    dequantize_row_q4_1(srcByte, uint(dstRow*pcs.nb1/4) + pcs.outOff, pcs.ne00);
+}
diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp
new file mode 100644
index 000000000..4907015d8
--- /dev/null
+++ b/kompute/op_mul.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() { // element-wise product of inA and inB; each workgroup writes one element of out_
+    const uint idx = gl_WorkGroupID.x;
+    const float lhs = inA[pcs.inAOff + idx];
+    out_[pcs.outOff + idx] = lhs * inB[pcs.inBOff + idx];
+}
\ No newline at end of file
diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp
new file mode 100644
index 000000000..f1198b593
--- /dev/null
+++ b/kompute/op_mul_mat_f16.comp
@@ -0,0 +1,177 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 64) in;
+
+layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    uint nb01;
+    uint nb02;
+    uint nb11;
+    uint nb12;
+    int ne0;
+    int ne1;
+} pcs;
+
+shared float sum[gl_WorkGroupSize.x];
+
+void main() { // each workgroup reduces one dot product of an inA (float16) row with an inB (float) row into one out_ element
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint im = gl_WorkGroupID.z;
+
+    const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA (float16_t index; nb01/nb02 presumably byte strides, hence /2 - TODO confirm)
+    const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB (float index; nb11/nb12 presumably byte strides, hence /4 - TODO confirm)
+
+    sum[gl_LocalInvocationID.x] = 0.0;
+
+    for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) { // each thread takes every gl_WorkGroupSize.x-th element
+        sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
+    }
+
+    // accumulate the sum from all threads in the threadgroup (pairwise halving; relies on gl_WorkGroupSize.x being a power of two)
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier(); // finish this halving step in every thread before the next one reads sum[]
+        memoryBarrierShared();
+    }
+
+    if (gl_LocalInvocationID.x == 0) { // sum[0] now holds the full dot product; a single thread stores it
+        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
+    }
+}
diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp
new file mode 100644
index 000000000..206aea7d5
--- /dev/null
+++ b/kompute/op_mul_mat_q4_0.comp
@@ -0,0 +1,195 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+} pcs;
+
+shared float sum[64];
+
+void main() { // one 8x8 workgroup computes one out_ element: dot(q4_0 row r0 of inA, float row r1 of inB)
+    const uint nb = uint(pcs.ne00/QK4_0); // q4_0 blocks per inA row
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+
+    const uint x = r0*nb; // Based from inA without base offset (counted in blocks, converted to bytes below)
+    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
+
+    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
+    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y; // flat thread index into sum[]
+
+    const uint ix = gl_LocalInvocationID.y/4;           // 0 or 1
+    const uint iy = gl_LocalInvocationID.y - 4*ix;      // 0...3
+
+    const uint first = 4 * iy; // first of the 4 quant bytes this thread handles inside a block
+
+    float sumf = 0.0;
+
+    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
+        const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff; // byte offset of block i of this row in inA
+        const float d = float(u8BufToFloat16(inA, index)); // block scale stored in the first two bytes
+
+        const uint xl = first; // Based from bl->qs
+        const uint yl = y + i * QK4_0 + first; // Based from inB
+
+        vec2 acc = vec2(0.0, 0.0); // x: sum of quant*inB products, y: sum of the inB values used
+
+        for (int j = 0; j < 4; ++j) {
+            const uint8_t b = inA[index+2+xl+j]; // quant bytes start 2 bytes into the block, after d
+            acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4); // low nibble pairs with element j, high nibble with j+16
+            acc.y += inB[yl+j] + inB[yl+j+16];
+        }
+
+        sumf += d * (acc.x - 8.*acc.y); // equals the sum of d*(q - 8)*inB over the 8 values handled here
+    }
+
+    sum[ith] = sumf;
+
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    barrier(); // every thread's partial sum must be stored before thread 0 reads sum[]
+    if (ith == 0) {
+        float sumTotal = 0.0;
+        for (uint i = 0; i < nth; ++i) { // serial reduction over all nth partial sums
+            sumTotal += sum[i];
+        }
+        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
+    }
+}
diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp
new file mode 100644
index 000000000..8bdf810a1
--- /dev/null
+++ b/kompute/op_mul_mat_q4_1.comp
@@ -0,0 +1,218 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+} pcs;
+
+shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];
+
+#define UNALIGNED_INPUT inA
+
+block_q4_1 get_unaligned_block_q4_1(uint index) { // read one block_q4_1 out of the byte buffer UNALIGNED_INPUT, starting at byte `index`
+    block_q4_1 decoded;
+    decoded.d = u8BufToFloat16(UNALIGNED_INPUT, index);     // first fp16 field
+    decoded.m = u8BufToFloat16(UNALIGNED_INPUT, index + 2); // second fp16 field
+    [[unroll]] for (uint k = 0; k < QK4_1 / 2; ++k) {
+        decoded.qs[k] = UNALIGNED_INPUT[index + 4 + k];     // packed quant bytes follow the two fp16 fields
+    }
+    return decoded;
+}
+
+void main() { // one 8x8 workgroup computes one out_ element: dot(q4_1 row r0 of inA, float row r1 of inB)
+    const uint nb = uint(pcs.ne00/QK4_1); // q4_1 blocks per inA row
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+
+    const uint x = r0*nb; // Based from inA without base offset (counted in blocks, converted to bytes below)
+    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
+
+    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
+    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y; // flat thread index into sum[]
+
+    const uint ix = gl_LocalInvocationID.y/4;           // 0 or 1
+    const uint iy = gl_LocalInvocationID.y - 4*ix;      // 0...3
+
+    const uint first = 4 * iy; // first of the 4 quant bytes this thread handles inside a block
+
+    float sumf = 0.0;
+
+    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
+        //TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
+
+        const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
+
+        const float d = float(block.d);
+        const float m = float(block.m);
+
+        const uint xl = first; // Based from bl->qs
+        const uint yl = y + i * QK4_1 + first; // Based from inB
+
+        vec2 acc = vec2(0.0, 0.0); // x: low-nibble products, y: high-nibble products
+
+        for (int j = 0; j < 4; ++j) {
+            acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);    // low nibble pairs with inB element j
+            acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);  // high nibble pairs with inB element j+16
+        }
+        // Each term above is already fully dequantized (d*q + m), so the block contributes their plain sum.
+        sumf += acc.x + acc.y;
+    }
+
+    sum[ith] = sumf;
+
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    barrier();
+    memoryBarrierShared();
+    if (ith%4 == 0) { // stage 1: fold each group of 4 neighbouring partial sums
+        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
+    }
+    barrier();
+    memoryBarrierShared();
+    if (ith%16 == 0) { // stage 2: fold the stage-1 results in groups of 16 threads
+        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
+    }
+    barrier();
+    memoryBarrierShared();
+    if (ith == 0) { // stage 3: thread 0 folds the remaining per-16 sums and stores the result
+        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
+        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
+    }
+}
diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp
new file mode 100644
index 000000000..3defd0a5f
--- /dev/null
+++ b/kompute/op_mulrow.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() { // multiplies each inA element by an inB element whose index wraps around every pcs.row elements
+    const uint idx = gl_WorkGroupID.x;
+    const float lhs = inA[pcs.inAOff + idx];
+    out_[pcs.outOff + idx] = lhs * inB[pcs.inBOff + (idx % pcs.row)];
+}
\ No newline at end of file
diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp
new file mode 100644
index 000000000..ec0a8568d
--- /dev/null
+++ b/kompute/op_norm.comp
@@ -0,0 +1,209 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+typedef struct {
+    float16_t    d[2];          // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];    // 4-bit quants
+} block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;             // super-block scale for quantized scales
+    float16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 256
+
+layout(local_size_x = nth) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    uint ne00;
+    uint nb01;
+    float eps;
+} pcs;
+
+shared float sum[nth];
+
+void main() { // each workgroup normalizes one row of ne00 floats: out = (in - mean) / sqrt(variance + eps)
+    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ (nb01/4: presumably a byte stride converted to floats - TODO confirm)
+    // MEAN
+    // parallel sum
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        sum[gl_LocalInvocationID.x] += in_[x+i00];
+    }
+
+    // reduce (pairwise halving; nth is a power of two)
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    if (gl_LocalInvocationID.x == 0) {
+        sum[0] /= float(pcs.ne00);
+    }
+    barrier();
+    memoryBarrierShared();
+    const float mean = sum[0];
+
+    // recenter; ne00 is an element count and out_ is float[], so output row r starts at r*ne00 (no byte-to-float division)
+    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        out_[y+i00] = in_[x+i00] - mean;
+    }
+    barrier(); // every thread must have read the mean from sum[0] before sum[] is reset below
+    // VARIANCE
+    // parallel sum (each thread re-reads only the out_ elements it wrote itself above)
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    if (gl_LocalInvocationID.x == 0) {
+        sum[0] /= float(pcs.ne00);
+    }
+    barrier();
+    memoryBarrierShared();
+    const float variance = sum[0];
+
+    const float scale = 1.0f/sqrt(variance + pcs.eps); // eps keeps the divisor away from zero for constant rows
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        out_[y+i00] *= scale;
+    }
+}
diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp
new file mode 100644
index 000000000..bc2c31f43
--- /dev/null
+++ b/kompute/op_relu.comp
@@ -0,0 +1,141 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {                // GLSL reserves `typedef`; declare the struct by name instead
+    float16_t d[2];                // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;                   // super-block scale for quantized scales
+    float16_t dmin;                // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE];  // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+// Input/output tensors viewed as float arrays; the push-constant offsets are added to float indices.
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(push_constant) uniform PushConstants {
+    uint inOff;  // added to the element index when reading in_
+    uint outOff; // added to the element index when writing out_
+} pcs;
+
+void main() {
+    const uint elem = gl_WorkGroupID.x; // one workgroup per element (local_size_x = 1)
+    const float v = in_[elem + pcs.inOff];
+    out_[elem + pcs.outOff] = max(0.0, v); // ReLU: negative inputs become zero
+}
diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp
new file mode 100644
index 000000000..784713c36
--- /dev/null
+++ b/kompute/op_rmsnorm.comp
@@ -0,0 +1,178 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {                // GLSL reserves `typedef`; declare the struct by name instead
+    float16_t d[2];                // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;                   // super-block scale for quantized scales
+    float16_t dmin;                // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE];  // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 256
+// nth is both the workgroup width and the length of the shared reduction array below.
+layout(local_size_x = nth) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict tensorOut { float out_[]; };
+// Push constants; both offsets are applied to float indices.
+layout(push_constant) uniform PushConstants {
+    uint inOff;  // added to the float index of every in_ access
+    uint outOff; // added to the float index of every out_ access
+    uint ne00;   // elements processed per row (loop bound and divisor of the mean)
+    uint nb01;   // input row stride; main() divides it by 4 before indexing floats
+    float eps;   // added to the mean square before taking the square root
+} pcs;
+
+shared float sum[nth]; // one partial sum per invocation, reduced into sum[0]
+
+void main() {
+    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
+    // Each workgroup normalizes one row: out = in / sqrt(mean(in^2) + eps).
+    // parallel sum: every invocation accumulates the squares of a strided slice
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
+    }
+
+    // reduce: tree-sum the nth partials into sum[0]
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast: invocation 0 turns the total into the mean square, then everyone reads it
+    if (gl_LocalInvocationID.x == 0) {
+        sum[0] /= float(pcs.ne00);
+    }
+    barrier();
+    memoryBarrierShared();
+
+    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
+    // out_ is indexed in floats, so row r starts r*ne00 elements in (no /4; assumes packed rows).
+    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        out_[y+i00] = in_[x+i00] * scale;
+    }
+}
diff --git a/kompute/op_rope.comp b/kompute/op_rope.comp
new file mode 100644
index 000000000..ca6bb6831
--- /dev/null
+++ b/kompute/op_rope.comp
@@ -0,0 +1,183 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {                // GLSL reserves `typedef`; declare the struct by name instead
+    float16_t d[2];                // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;                   // super-block scale for quantized scales
+    float16_t dmin;                // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE];  // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+// Input/output tensors viewed as float arrays.
+layout (binding = 0) readonly buffer tensorIn { float in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { float out_[]; };
+// Push constants consumed by main(); nb* values are summed and divided by 4 to index floats.
+layout (push_constant) uniform parameter {
+    uint inOff;       // added to the float index of every in_ access
+    uint outOff;      // added to the float index of every out_ access
+    uint n_past;      // added to the y workgroup index to form the position when (mode & 1) == 0
+    int n_dims;       // divisor in the exponent of the per-pair angle multiplier
+    int mode;         // bit 0: position excludes n_past; bit 1 (value 2): NeoX path (not implemented)
+    float freq_base;  // base of the pow() that yields the per-pair angle multiplier
+    float freq_scale; // multiplies the position to give the starting angle
+    uint nb00;        // source stride applied to i0 (element within the row)
+    uint nb01;        // source stride applied to the x workgroup index
+    uint nb02;        // source stride applied to the y workgroup index
+    uint nb03;        // source stride applied to the z workgroup index
+    int ne0;          // loop bound for i0; elements are consumed in pairs
+    uint nb0;         // destination stride applied to i0
+    uint nb1;         // destination stride applied to the x workgroup index
+    uint nb2;         // destination stride applied to the y workgroup index
+    uint nb3;         // destination stride applied to the z workgroup index
+} pcs;
+
+void main() {
+    // The workgroup id picks the row to rotate: x -> i1, y -> i2, z -> i3.
+    const uint row = gl_WorkGroupID.x;
+    const uint plane = gl_WorkGroupID.y;
+    const uint batch = gl_WorkGroupID.z;
+
+    // Mode bit 1 (value 2) selects the NeoX variant.
+    // TODO: implement; until then such dispatches write nothing.
+    if ((pcs.mode & 2) != 0) {
+        return;
+    }
+
+    // Position: n_past + plane when mode bit 0 is clear, otherwise plane alone.
+    const uint pos = ((pcs.mode & 1) == 0 ? pcs.n_past + plane : plane);
+    const float angle_step = pow(pcs.freq_base, -2.0/pcs.n_dims);
+    float angle = pcs.freq_scale * float(pos);
+
+    for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
+        const float c = cos(angle);
+        const float s = sin(angle);
+        angle *= angle_step; // advance to the angle of the next pair
+
+        // The stride-weighted offset is divided by 4 to index floats; buffer offsets are added after.
+        const uint src = uint((batch*pcs.nb03 + plane*pcs.nb02 + row*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff;
+        const uint dst = uint((batch*pcs.nb3  + plane*pcs.nb2  + row*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff;
+
+        const float a = in_[src];
+        const float b = in_[src+1];
+        out_[dst] = a*c - b*s;
+        out_[dst+1] = a*s + b*c;
+    }
+}
diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp
new file mode 100644
index 000000000..f537121a4
--- /dev/null
+++ b/kompute/op_scale.comp
@@ -0,0 +1,142 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {                // GLSL reserves `typedef`; declare the struct by name instead
+    float16_t d[2];                // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;                   // super-block scale for quantized scales
+    float16_t dmin;                // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE];  // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+layout(local_size_x = 1) in;
+// Input/output tensors viewed as float arrays.
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+// Push constants; both offsets are applied to float indices.
+layout(push_constant) uniform PushConstants {
+    uint inOff;  // added to the element index when reading in_
+    uint outOff; // added to the element index when writing out_
+    float scale; // factor every input element is multiplied by
+} pcs;
+
+void main() {
+    const uint elem = gl_WorkGroupID.x; // one workgroup per element (local_size_x = 1)
+    const float v = in_[elem + pcs.inOff];
+    out_[elem + pcs.outOff] = v * pcs.scale;
+}
\ No newline at end of file
diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp
new file mode 100644
index 000000000..90c034ac7
--- /dev/null
+++ b/kompute/op_silu.comp
@@ -0,0 +1,141 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {                // GLSL reserves `typedef`; declare the struct by name instead
+    float16_t d[2];                // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;                   // super-block scale for quantized scales
+    float16_t dmin;                // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE];  // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+// Input/output tensors viewed as float arrays; the push-constant offsets are added to float indices.
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(push_constant) uniform PushConstants {
+    uint inOff;  // added to the element index when reading in_
+    uint outOff; // added to the element index when writing out_
+} pcs;
+void main() {
+    const uint elem = gl_WorkGroupID.x; // one workgroup per element (local_size_x = 1)
+    const float v = in_[elem + pcs.inOff];
+    const float denom = 1.0 + exp(-v);
+    out_[elem + pcs.outOff] = v / denom; // SiLU: v * sigmoid(v)
+}
diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp
new file mode 100644
index 000000000..ce0e71924
--- /dev/null
+++ b/kompute/op_softmax.comp
@@ -0,0 +1,197 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#version 450
+
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+#define sizeof_block_q4_1 0x14
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+#ifndef QK_K
+#define QK_K 256
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
+
+struct block_q2_K {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    float16_t d;           // super-block scale for quantized scales
+    float16_t dmin;        // super-block scale for quantized mins
+};
+// 84 bytes / block
+
+struct block_q3_K {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    float16_t d;             // super-block scale
+};
+
+#if QK_K == 64
+struct block_q4_K {                // GLSL reserves `typedef`; declare the struct by name instead
+    float16_t d[2];                // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#else
+struct block_q4_K {
+    float16_t d;                   // super-block scale for quantized scales
+    float16_t dmin;                // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE];  // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];            // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t  d;                     // super-block scales/mins
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;                      // super-block scale for quantized scales
+    float16_t dmin;                   // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;                  // super-block scale
+};
+// 210 bytes / block
+
+#define nth 32
+// nth is both the workgroup width and the length of the shared scratch array below.
+layout(local_size_x = nth) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+// Push constants; both offsets are applied to float indices.
+layout(push_constant) uniform PushConstants {
+    uint inOff;  // added to the float index of every in_ access
+    uint outOff; // added to the float index of every out_ access
+    int ne00;    // elements per row (loop bound) and innermost factor of the row offset
+    int ne01;    // multiplies the y and z workgroup indices in the row offset
+    int ne02;    // multiplies the z workgroup index in the row offset
+} pcs;
+
+shared float buf[nth]; // per-invocation partials, reused for the max pass and then the sum pass
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+    // Each workgroup computes softmax over one row of ne00 floats; the row offset is in elements.
+    const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
+    const uint psrc0 = extra_off + pcs.inOff; // Based from in_
+    const uint pdst = extra_off + pcs.outOff; // Based from out_
+
+    // parallel max: start from -infinity (0xFF800000) so any input value replaces it
+    buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
+    }
+
+    // reduce: tree-combine the nth partial maxima into buf[0]
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast: every invocation takes a private copy of the row maximum
+    const float max_ = buf[0];
+    barrier(); // all reads of buf[0] must complete before buf is reused for the sum below
+    // parallel sum of exp(x - max), which keeps the exponent argument <= 0
+    buf[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
+    }
+
+    // reduce: tree-sum the nth partials into buf[0]
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast: buf is not written again, so no further barrier is needed after this read
+    const float sum = buf[0];
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
+        out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
+    }
+}
diff --git a/kompute/scripts/convert_shaders.py b/kompute/scripts/convert_shaders.py
new file mode 100644
index 000000000..9375b6701
--- /dev/null
+++ b/kompute/scripts/convert_shaders.py
@@ -0,0 +1,148 @@
+"""
+    Script to handle conversion of compute shaders to spirv and to headers
+"""
+import os
+import sys
+import logging
+import click
+import subprocess
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler())
+
+is_windows = sys.platform.startswith('win')
+
+CWD=os.path.dirname(os.path.abspath(__file__))
+XXD_LINUX_CMD="xxd"
+XXD_WINDOWS_CMD=os.path.abspath(os.path.join(CWD, "..\\external\\bin\\", "xxd.exe"))
+
+SHADER_GENERATED_NOTICE = """/*
+    THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT
+
+    ---
+
+    Copyright 2020 The Institute for Ethical AI & Machine Learning
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+"""
+
+@click.command()
+@click.option(
+    "--shader-path",
+    "-p",
+    envvar="KOMPUTE_SHADER_PATH",
+    required=True,
+    help="The path for the directory to build and convert shaders",
+)
+@click.option(
+    "--shader-binary",
+    "-s",
+    envvar="KOMPUTE_SHADER_BINARY",
+    required=True,
+    help="The path to the shader compiler binary used to build the spirv files",
+)
+@click.option(
+    "--header-path",
+    "-c",
+    envvar="KOMPUTE_HEADER_PATH",
+    default="",
+    required=False,
+    help="The (optional) output directory for the cpp header files",
+)
+@click.option(
+    "--verbose",
+    "-v",
+    envvar="KOMPUTE_VERBOSE",
+    default=False,
+    is_flag=True,
+    help="Enable verbosity if flag is provided",
+)
+def run_cli(
+    shader_path: str = None,
+    shader_binary: str = None,
+    header_path: str = None,
+    verbose: bool = None,
+):
+    """
+    Compile every ``.comp`` shader under ``shader_path`` to SPIR-V with
+    ``shader_binary`` and, when ``header_path`` is given, also write one C++
+    header per shader (wrapping the ``xxd -i`` dump) into that directory.
+
+    Raises ``subprocess.CalledProcessError`` if the compiler or xxd fails.
+    """
+
+    if verbose:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.WARNING)
+
+    logger.debug(f"Starting script with variables: {locals()}")
+
+    if is_windows:
+        logger.debug("Running on windows, converting input paths")
+        shader_path = shader_path.replace("/", "\\")
+        header_path = header_path.replace("/", "\\")
+
+    shader_files = []
+    for root, _dirs, files in os.walk(shader_path):
+        for file in files:
+            if file.endswith(".comp"):
+                shader_files.append(os.path.join(root, file))
+
+    # Runs a command and returns its decoded stdout
+    run_cmd = lambda *args: subprocess.check_output([*args]).decode()
+
+    logger.debug(f"Output spirv path: {shader_path}")
+    logger.debug(f"Converting files to spirv: {shader_files}")
+
+    spirv_files = []
+    for file in shader_files:
+        logger.debug(f"Converting to spirv: {file}")
+        spirv_file = f"{file}.spv"
+        run_cmd(shader_binary, "-V", file, "-o", spirv_file)
+        spirv_files.append(spirv_file)
+
+    # Create cpp files if header_path provided
+    if header_path:
+        logger.debug("Header path provided. Converting bin files to hpp.")
+        logger.debug(f"Output header path: {header_path}")
+
+        # Windows uses the xxd.exe path from XXD_WINDOWS_CMD, other platforms plain "xxd"
+        xxd_cmd = XXD_WINDOWS_CMD if is_windows else XXD_LINUX_CMD
+        logger.debug(f"Using xxd command: {xxd_cmd}")
+
+        for file in spirv_files:
+            header_data = str(run_cmd(xxd_cmd, "-i", file))
+            # Ensuring the variable is a static const unsigned
+            header_data = header_data.replace("unsigned", "static const unsigned")
+            # e.g. "op_add.comp.spv" -> "shaderop_add.hpp"
+            raw_file_name = os.path.basename(file)
+            file_name = f"shader{raw_file_name}"
+            header_file = file_name.replace(".comp.spv", ".hpp")
+            header_file_define = "SHADEROP_" + header_file.replace(".", "_").upper()
+            logger.debug(f"Converting to hpp: {file_name}")
+            with open(os.path.join(header_path, header_file), "w+", newline='\n') as fstream:
+                fstream.write(f"{SHADER_GENERATED_NOTICE}\n")
+                fstream.write(f"#ifndef {header_file_define}\n")
+                fstream.write(f"#define {header_file_define}\n\n")
+                fstream.write("namespace kp {\n")
+                fstream.write("namespace shader_data {\n")
+                fstream.write(f"{header_data}")
+                fstream.write("}\n")
+                fstream.write("}\n")
+                fstream.write(f"#endif // define {header_file_define}\n")
+
+
+if __name__ == "__main__":
+    run_cli()
diff --git a/kompute/scripts/requirements.txt b/kompute/scripts/requirements.txt
new file mode 100644
index 000000000..4da042504
--- /dev/null
+++ b/kompute/scripts/requirements.txt
@@ -0,0 +1,11 @@
+# CLI dependencies
+click==7.1.2
+
+# Dev dependencies
+black==19.10b0
+quom==1.2.0
+Sphinx==3.2.1
+sphinx_material==0.0.30
+breathe==4.20.0
+m2r2==0.2.5
+git+https://github.com/pybind/pybind11_mkdoc.git@master
diff --git a/kompute/setup.py b/kompute/setup.py
new file mode 100644
index 000000000..09faa8d1a
--- /dev/null
+++ b/kompute/setup.py
@@ -0,0 +1,93 @@
+import os
+import re
+import platform
+import sys
+import sysconfig
+import subprocess
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+from distutils.version import LooseVersion
+
+curr_dir = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(curr_dir, 'README.md'), encoding='utf-8') as f:
+    long_description = f.read()
+
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir=''):
+        super().__init__(name, sources=[])  # no source files are listed for setuptools
+        self.sourcedir = os.path.abspath(sourcedir)  # absolute path of the CMake source tree
+
+
+class CMakeBuild(build_ext):
+    """build_ext command that configures and builds each extension with CMake."""
+    def run(self):
+        try:
+            out = subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
+        if cmake_version < '3.15':
+            raise RuntimeError("CMake >= 3.15 is required")
+
+        for ext in self.extensions:
+            self.build_extension(ext)
+
+    def build_extension(self, ext):
+        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+        # required for auto-detection of auxiliary "native" libs
+        if not extdir.endswith(os.path.sep):
+            extdir += os.path.sep
+
+        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                      '-DKOMPUTE_OPT_BUILD_PYTHON=ON',
+                      '-DKOMPUTE_OPT_LOG_LEVEL=Off',
+                      '-DKOMPUTE_OPT_USE_SPDLOG=Off',
+                      '-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON',  # comma keeps this a separate argument
+                      '-DPYTHON_EXECUTABLE=' + sys.executable,
+                      '-DPYTHON_INCLUDE_DIR=' + sysconfig.get_path('include'),
+                      '-DPYTHON_LIBRARY=' + sysconfig.get_path('stdlib'),
+        ]
+
+        cfg = 'Debug' if self.debug else 'Release'
+        build_args = ['--config', cfg]
+
+        env = os.environ.copy()
+        oldCxxFlags = env.get('CXXFLAGS', '')
+        env['CXXFLAGS'] = f'{oldCxxFlags} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"'
+
+        if platform.system() == "Windows":
+            cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}']
+            if sys.maxsize > 2**32:
+                cmake_args += ['-A', 'x64']
+            build_args += ['--', '/m']
+        else:
+            env['CXXFLAGS'] += ' -fPIC'
+            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+            build_args += ['--', '-j']
+            # Optional environment variable to limit the number of parallel jobs for GitHub actions to reduce RAM usage
+            if 'KOMPUTE_PYTHON_NUM_PARALLEL_THREADS' in env:
+                build_args += [env['KOMPUTE_PYTHON_NUM_PARALLEL_THREADS']]  # one argument, not one per character
+
+        os.makedirs(self.build_temp, exist_ok=True)
+
+        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+
+setup(
+    name='kp',
+    version='0.8.1',
+    author='Alejandro Saucedo',
+    description='Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    ext_modules=[CMakeExtension('kp')],
+    install_requires=[
+        "numpy<2.0.0"
+    ],
+    cmdclass=dict(build_ext=CMakeBuild),
+    zip_safe=False,
+    include_package_data=True,
+)
diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp
new file mode 100644
index 000000000..9c41ec90f
--- /dev/null
+++ b/kompute/src/Algorithm.cpp
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include <fstream>
+
+#include "kompute/Algorithm.hpp"
+
+namespace kp {
+
+Algorithm::~Algorithm()
+{
+    KP_LOG_DEBUG("Kompute Algorithm Destructor started");
+
+    this->destroy();
+}
+
+bool
+Algorithm::isInit()
+{
+    return this->mPipeline && this->mPipelineCache && this->mPipelineLayout &&
+           this->mDescriptorPool && this->mDescriptorSet &&
+           this->mDescriptorSetLayout && this->mShaderModule;
+}
+
+void
+Algorithm::destroy()
+{
+    // We don't have to free memory on destroy as it's freed by the
+    // commandBuffer destructor if (this->mPushConstantsData) {
+    //     free(this->mPushConstantsData);
+    // }
+    // if (this->mSpecializationConstantsData) {
+    //     free(this->mSpecializationConstantsData);
+    // }
+
+    if (!this->mDevice) {
+        KP_LOG_WARN("Kompute Algorithm destroy function reached with null "
+                    "Device pointer");
+        return;
+    }
+
+    if (this->mFreePipeline && this->mPipeline) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline");
+        if (!this->mPipeline) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
+                        "pipeline but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mPipeline,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mPipeline = nullptr;
+    }
+
+    if (this->mFreePipelineCache && this->mPipelineCache) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline cache");
+        if (!this->mPipelineCache) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
+                        "pipeline cache but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mPipelineCache,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mPipelineCache = nullptr;
+    }
+
+    if (this->mFreePipelineLayout && this->mPipelineLayout) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout");
+        if (!this->mPipelineLayout) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
+                        "pipeline layout but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mPipelineLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mPipelineLayout = nullptr;
+    }
+
+    if (this->mFreeShaderModule && this->mShaderModule) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying shader module");
+        if (!this->mShaderModule) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy shader "
+                        "module but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mShaderModule,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mShaderModule = nullptr;
+    }
+
+    freeParameters();
+}
+
+void
+Algorithm::freeParameters()
+{
+    if (this->mFreeDescriptorSetLayout && this->mDescriptorSetLayout) {
+        KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Set Layout");
+        if (!this->mDescriptorSetLayout) {
+            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
+                        "descriptor set layout but it is null");
+        }
+        this->mDevice->destroy(
+          *this->mDescriptorSetLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDescriptorSetLayout = nullptr;
+    }
+}
+
+// Builds the descriptor set layout (one storage-buffer binding per tensor),
+// allocates a descriptor set from mDescriptorPool and binds each tensor to it.
+void
+Algorithm::createParameters()
+{
+    KP_LOG_DEBUG("Kompute Algorithm createParameters started");
+    if (!*this->mDescriptorPool) {
+        KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
+        return;
+    }
+
+    std::vector<vk::DescriptorSetLayoutBinding> descriptorSetBindings;
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        descriptorSetBindings.push_back(
+          vk::DescriptorSetLayoutBinding(i, // Binding index
+                                         vk::DescriptorType::eStorageBuffer,
+                                         1, // Descriptor count
+                                         vk::ShaderStageFlagBits::eCompute));
+    }
+
+    // This is the component that is fed into the pipeline
+    vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo(
+      vk::DescriptorSetLayoutCreateFlags(),
+      static_cast<uint32_t>(descriptorSetBindings.size()),
+      descriptorSetBindings.data());
+
+    KP_LOG_DEBUG("Kompute Algorithm creating descriptor set layout");
+    this->mDescriptorSetLayout = std::make_shared<vk::DescriptorSetLayout>();
+    vk::Result result = this->mDevice->createDescriptorSetLayout(
+      &descriptorSetLayoutInfo, nullptr, this->mDescriptorSetLayout.get());
+
+    if (result != vk::Result::eSuccess) {
+        KP_LOG_ERROR("Failed to create descriptor set layout. Error code: {}", vk::to_string(result));
+        return; // a set cannot be allocated without a valid layout
+    }
+    this->mFreeDescriptorSetLayout = true;
+    KP_LOG_DEBUG("Successfully allocated descriptor set layout.");
+
+    vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
+      *this->mDescriptorPool,
+      1, // Descriptor set layout count
+      this->mDescriptorSetLayout.get());
+
+    KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
+    this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
+    result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
+                                          this->mDescriptorSet.get());
+
+    if (result != vk::Result::eSuccess) {
+        KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
+        return; // do not write descriptors into a set that was never allocated
+    }
+    this->mFreeDescriptorSet = true;
+    KP_LOG_DEBUG("Successfully allocated descriptor sets.");
+
+    KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
+
+        vk::DescriptorBufferInfo descriptorBufferInfo =
+          this->mTensors[i]->constructDescriptorBufferInfo();
+
+        computeWriteDescriptorSets.push_back(
+          vk::WriteDescriptorSet(*this->mDescriptorSet,
+                                 i, // Destination binding
+                                 0, // Destination array element
+                                 1, // Descriptor count
+                                 vk::DescriptorType::eStorageBuffer,
+                                 nullptr, // Descriptor image info
+                                 &descriptorBufferInfo));
+
+        this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
+                                            nullptr);
+    }
+
+    KP_LOG_DEBUG("Kompute Algorithm successfully run init");
+}
+
+// Allocates a new descriptor set from the existing layout and binds every
+// tensor in mTensors to it. NOTE(review): the previous set is not freed here.
+void
+Algorithm::updateParameters()
+{
+    KP_LOG_DEBUG("Kompute Algorithm updateParameters started");
+    if (!*this->mDescriptorPool) {
+        KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
+        return;
+    }
+
+    vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
+      *this->mDescriptorPool,
+      1, // Descriptor set layout count
+      this->mDescriptorSetLayout.get());
+
+    KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
+    this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
+    vk::Result result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
+                                          this->mDescriptorSet.get());
+
+    if (result != vk::Result::eSuccess) {
+        KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
+        return; // do not write descriptors into a set that was never allocated
+    }
+    this->mFreeDescriptorSet = true;
+    KP_LOG_DEBUG("Successfully allocated descriptor sets.");
+
+    KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
+
+        vk::DescriptorBufferInfo descriptorBufferInfo =
+          this->mTensors[i]->constructDescriptorBufferInfo();
+
+        computeWriteDescriptorSets.push_back(
+          vk::WriteDescriptorSet(*this->mDescriptorSet,
+                                 i, // Destination binding
+                                 0, // Destination array element
+                                 1, // Descriptor count
+                                 vk::DescriptorType::eStorageBuffer,
+                                 nullptr, // Descriptor image info
+                                 &descriptorBufferInfo));
+
+        this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
+                                            nullptr);
+    }
+
+    KP_LOG_DEBUG("Kompute Algorithm successfully run init");
+}
+
+// Creates the Vulkan shader module from the SPIR-V words stored in mSpirv.
+void
+Algorithm::createShaderModule()
+{
+    KP_LOG_DEBUG("Kompute Algorithm createShaderModule started");
+    vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(),
+                                                sizeof(uint32_t) * this->mSpirv.size(),
+                                                this->mSpirv.data());
+    KP_LOG_DEBUG("Kompute Algorithm Creating shader module. ShaderFileSize: {}",
+                 this->mSpirv.size());
+    this->mShaderModule = std::make_shared<vk::ShaderModule>();
+    vk::Result result = this->mDevice->createShaderModule(
+      &shaderModuleInfo, nullptr, this->mShaderModule.get());
+    if (result != vk::Result::eSuccess) {
+        KP_LOG_ERROR("Failed to create shader module. Error code: {}", vk::to_string(result));
+        return; // the module is not marked as owned, so it is not flagged for destruction
+    }
+    this->mFreeShaderModule = true;
+    KP_LOG_DEBUG("Kompute Algorithm create shader module success");
+}
+
+void
+Algorithm::createPipeline()
+{
+    KP_LOG_DEBUG("Kompute Algorithm calling create Pipeline");
+
+    vk::PipelineLayoutCreateInfo pipelineLayoutInfo(
+      vk::PipelineLayoutCreateFlags(),
+      1, // Set layout count
+      this->mDescriptorSetLayout.get());
+
+    vk::PushConstantRange pushConstantRange;
+    if (this->mPushConstantsSize) { // a push constant range is only declared when constants were provided
+        pushConstantRange.setStageFlags(vk::ShaderStageFlagBits::eCompute);
+        pushConstantRange.setOffset(0);
+        pushConstantRange.setSize(this->mPushConstantsDataTypeMemorySize *
+                                  this->mPushConstantsSize);
+
+        pipelineLayoutInfo.setPushConstantRangeCount(1);
+        pipelineLayoutInfo.setPPushConstantRanges(&pushConstantRange);
+    }
+
+    this->mPipelineLayout = std::make_shared<vk::PipelineLayout>();
+    this->mDevice->createPipelineLayout(
+      &pipelineLayoutInfo, nullptr, this->mPipelineLayout.get()); // NOTE(review): return value is ignored
+    this->mFreePipelineLayout = true;
+
+    std::vector<vk::SpecializationMapEntry> specializationEntries;
+
+    for (uint32_t i = 0; i < this->mSpecializationConstantsSize; i++) { // entry i: id i, offset i * element size
+        vk::SpecializationMapEntry specializationEntry(
+          static_cast<uint32_t>(i),
+          static_cast<uint32_t>(
+            this->mSpecializationConstantsDataTypeMemorySize * i),
+          this->mSpecializationConstantsDataTypeMemorySize);
+
+        specializationEntries.push_back(specializationEntry);
+    }
+
+    // This passes ownership of the memory so we remove ownership from
+    // specialization container by using "transferDataOwnership"
+    vk::SpecializationInfo specializationInfo(
+      static_cast<uint32_t>(specializationEntries.size()),
+      specializationEntries.data(),
+      this->mSpecializationConstantsDataTypeMemorySize *
+        this->mSpecializationConstantsSize,
+      this->mSpecializationConstantsData);
+
+    vk::PipelineShaderStageCreateInfo shaderStage(
+      vk::PipelineShaderStageCreateFlags(),
+      vk::ShaderStageFlagBits::eCompute,
+      *this->mShaderModule,
+      "main",
+      &specializationInfo);
+
+    static std::shared_ptr<vk::PipelineCache> globalPipelineCache = std::make_shared<vk::PipelineCache>();
+    if(!*globalPipelineCache) { // function-local static: the cache is created once, by the first call that finds it empty
+       vk::PipelineCacheCreateInfo pipelineCacheInfo =
+         vk::PipelineCacheCreateInfo();
+      this->mPipelineCache = globalPipelineCache; // NOTE(review): only this creating instance gets mPipelineCache; later instances leave it unset - confirm isInit() expectations
+      this->mFreePipelineCache = true; // NOTE(review): destroy() on this instance destroys the cache that later calls still pass to createComputePipeline - confirm lifetime
+      this->mDevice->createPipelineCache(
+        &pipelineCacheInfo, nullptr, globalPipelineCache.get());
+    }
+
+    vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(),
+                                               shaderStage,
+                                               *this->mPipelineLayout,
+                                               vk::Pipeline(),
+                                               0);
+
+#ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
+    vk::ResultValue<vk::Pipeline> pipelineResult =
+      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo);
+
+    if (pipelineResult.result != vk::Result::eSuccess) {
+        throw std::runtime_error("Failed to create pipeline result: " +
+                                 vk::to_string(pipelineResult.result));
+    }
+
+    vk::Pipeline& pipeline = pipelineResult.value;
+    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
+    this->mFreePipeline = true;
+#else
+    vk::Pipeline pipeline =
+      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo)
+        .value; // this branch takes .value without inspecting a result code
+    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
+    this->mFreePipeline = true;
+#endif
+
+    // TODO: Update to consistent
+    // this->mPipeline = std::make_shared<vk::Pipeline>();
+    // this->mDevice->createComputePipelines(
+    //         *this->mPipelineCache, 1, &pipelineInfo, nullptr,
+    //         this->mPipeline.get());
+
+    KP_LOG_DEBUG("Kompute Algorithm Create Pipeline Success");
+}
+
+void
+Algorithm::recordBindCore(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute Algorithm binding pipeline");
+
+    commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute,
+                               *this->mPipeline);
+
+    KP_LOG_DEBUG("Kompute Algorithm binding descriptor sets");
+
+    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                                     *this->mPipelineLayout,
+                                     0, // First set
+                                     *this->mDescriptorSet,
+                                     nullptr // Dispatcher
+    );
+}
+
+void
+Algorithm::recordBindPush(const vk::CommandBuffer& commandBuffer)
+{
+    if (this->mPushConstantsSize) {
+        KP_LOG_DEBUG("Kompute Algorithm binding push constants memory size: {}",
+                     this->mPushConstantsSize *
+                       this->mPushConstantsDataTypeMemorySize);
+
+        commandBuffer.pushConstants(*this->mPipelineLayout,
+                                    vk::ShaderStageFlagBits::eCompute,
+                                    0,
+                                    this->mPushConstantsSize *
+                                      this->mPushConstantsDataTypeMemorySize,
+                                    this->mPushConstantsData);
+    }
+}
+
+void
+Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute Algorithm recording dispatch");
+
+    commandBuffer.dispatch(
+      this->mWorkgroup[0], this->mWorkgroup[1], this->mWorkgroup[2]);
+}
+
+void
+Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize)
+{
+
+    KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size");
+
+    // Use the caller's workgroup when its x component is non-zero; otherwise
+    // fall back to a one-dimensional dispatch of { minSize, 1, 1 }.
+    if (workgroup[0] > 0) {
+        // x is taken as given; a y or z component of 0 is replaced by 1 so
+        // that no dimension of the stored dispatch size is zero
+        this->mWorkgroup = { workgroup[0],
+                             workgroup[1] > 0 ? workgroup[1] : 1,
+                             workgroup[2] > 0 ? workgroup[2] : 1 };
+    } else {
+        this->mWorkgroup = { minSize, 1, 1 };
+    }
+
+    KP_LOG_INFO("Kompute OpAlgoCreate set dispatch size X: {}, Y: {}, Z: {}",
+                this->mWorkgroup[0],
+                this->mWorkgroup[1],
+                this->mWorkgroup[2]);
+}
+
+const Workgroup&
+Algorithm::getWorkgroup()
+{
+    return this->mWorkgroup;
+}
+
+const std::vector<std::shared_ptr<Tensor>>&
+Algorithm::getTensors()
+{
+    return this->mTensors;
+}
+
+void Algorithm::setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    this->mTensors = tensors;
+}
+
+}
diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt
new file mode 100644
index 000000000..f4f8440f4
--- /dev/null
+++ b/kompute/src/CMakeLists.txt
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20)
+
+# Look up the "android" library when building for Android.
+if(KOMPUTE_OPT_ANDROID_BUILD)
+    find_library(android android)
+endif()
+
+
+add_library(kompute Algorithm.cpp
+    Manager.cpp
+    OpAlgoDispatch.cpp
+    OpMemoryBarrier.cpp
+    OpTensorCopy.cpp
+    OpTensorSyncDevice.cpp
+    OpTensorSyncLocal.cpp
+    OpBufferSyncDevice.cpp
+    OpBufferSyncLocal.cpp
+    Sequence.cpp
+    Tensor.cpp
+    Core.cpp)
+
+add_library(kompute::kompute ALIAS kompute)
+
+# Set version for shared libraries.
+set_target_properties(kompute
+    PROPERTIES
+    VERSION ${${PROJECT_NAME}_VERSION}
+    SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR})
+
+# Import GNU common install directory variables
+include(GNUInstallDirs)
+
+install(TARGETS kompute
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+# Include CMake helpers for package config files
+# Follow this installation guideline: https://cmake.org/cmake/help/latest/manual/cmake-packages.7.html
+include(CMakePackageConfigHelpers)
+
+configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in
+    "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake"
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
+
+install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake
+    ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)
+
+# ####################################################
+# Linking
+# ####################################################
+if(KOMPUTE_OPT_ANDROID_BUILD)
+    target_link_libraries(kompute PUBLIC vulkanAndroid
+        android
+        kp_logger
+        kp_shader
+        fmt::fmt)
+else()
+    target_link_libraries(kompute PUBLIC Vulkan::Vulkan
+        kp_logger
+        kp_shader
+        fmt::fmt)
+endif()
+
+if(KOMPUTE_OPT_BUILD_PYTHON)
+    include_directories(${PYTHON_INCLUDE_DIRS})
+
+    target_link_libraries(kompute PRIVATE pybind11::headers ${PYTHON_LIBRARIES})
+endif()
+
+if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
+    target_link_libraries(kompute PUBLIC Vulkan-Headers)
+endif()
+
+# ####################################################
+# Misc
+# ####################################################
+add_subdirectory(logger)
+add_subdirectory(shaders)
+add_subdirectory(include)
diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp
new file mode 100644
index 000000000..60849a3ec
--- /dev/null
+++ b/kompute/src/Core.cpp
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Core.hpp"
+
+#if VK_USE_PLATFORM_ANDROID_KHR
+#ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+#define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+/**
+ * Ensures support for dynamic loading of Vulkan functions on Android.
+ * Acts as a default store for loaded functions.
+ * More information:
+ * https://github.com/KhronosGroup/Vulkan-Hpp#vulkan_hpp_default_dispatcher
+ **/
+VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+#endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+
+namespace kp {
+} // namespace kp
diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp
new file mode 100644
index 000000000..07514ed9a
--- /dev/null
+++ b/kompute/src/Manager.cpp
@@ -0,0 +1,493 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Manager.hpp"
+#include "fmt/format.h"
+#include "kompute/logger/Logger.hpp"
+#include <fmt/core.h>
+#include <iterator>
+#include <set>
+#include <sstream>
+#include <string>
+
+namespace kp {
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+static VKAPI_ATTR VkBool32 VKAPI_CALL
+debugMessageCallback(VkDebugReportFlagsEXT /*flags*/,
+                     VkDebugReportObjectTypeEXT /*objectType*/,
+                     uint64_t /*object*/,
+                     size_t /*location*/,
+                     int32_t /*messageCode*/,
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
+                     const char* pLayerPrefix,
+                     const char* pMessage,
+#else // unnamed here; presumably KP_LOG_DEBUG expands to nothing at this log level
+                     const char* /*pLayerPrefix*/,
+                     const char* /*pMessage*/,
+#endif
+                     void* /*pUserData*/)
+{ // Debug-report callback: forwards the layer prefix and message to the debug log.
+    KP_LOG_DEBUG("[VALIDATION]: {} - {}", pLayerPrefix, pMessage);
+    return VK_FALSE; // VK_FALSE: the Vulkan call that triggered the report is not aborted
+}
+#endif // !KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+
+Manager::Manager()
+{
+    this->mManageResources = true;
+    // Only the instance is created here; the device comes from initializeDevice().
+    // Make sure the logger is set up before createInstance() starts logging.
+#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
+    logger::setupLogger();
+#endif
+    this->createInstance();
+}
+
+void Manager::initializeDevice(uint32_t physicalDeviceIndex,
+                               const std::vector<uint32_t>& familyQueueIndices,
+                               const std::vector<std::string>& desiredExtensions)
+{ // Thin wrapper over createDevice(); note the different argument order there.
+    this->createDevice(
+      familyQueueIndices, physicalDeviceIndex, desiredExtensions);
+}
+
+Manager::~Manager()
+{
+    KP_LOG_DEBUG("Kompute Manager Destructor started");
+    this->destroy(); // all teardown lives in destroy()
+}
+
+// Releases managed sequences, algorithms and tensors, then the device, the
+// debug report callback and the instance. Works without a device as well.
+void
+Manager::destroy()
+{
+    KP_LOG_DEBUG("Kompute Manager destroy() started");
+    if (this->mDevice == nullptr) {
+        // Keep going: without a device the instance must still be released.
+        KP_LOG_ERROR(
+          "Kompute Manager destructor reached with null Device pointer");
+    }
+
+    if (this->mManageResources && this->mManagedSequences.size()) {
+        KP_LOG_DEBUG("Kompute Manager explicitly running destructor for "
+                     "managed sequences");
+        for (const std::weak_ptr<Sequence>& weakSq : this->mManagedSequences) {
+            if (std::shared_ptr<Sequence> sq = weakSq.lock()) {
+                sq->destroy();
+            }
+        }
+        this->mManagedSequences.clear();
+    }
+
+    if (this->mManageResources && this->mManagedAlgorithms.size()) {
+        KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms");
+        for (const auto& weakAlgorithm : this->mManagedAlgorithms) {
+            if (std::shared_ptr<Algorithm> algorithm = weakAlgorithm.lock()) {
+                algorithm->destroy();
+            }
+        }
+        this->mManagedAlgorithms.clear();
+    }
+
+    if (this->mManageResources && this->mManagedTensors.size()) {
+        KP_LOG_DEBUG("Kompute Manager explicitly freeing tensors");
+        for (const std::weak_ptr<Tensor>& weakTensor : this->mManagedTensors) {
+            if (std::shared_ptr<Tensor> tensor = weakTensor.lock()) {
+                tensor->destroy();
+            }
+        }
+        this->mManagedTensors.clear();
+    }
+
+    if (this->mDevice != nullptr && this->mFreeDevice) {
+        KP_LOG_INFO("Destroying device");
+        this->mDevice->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice = nullptr;
+        KP_LOG_DEBUG("Kompute Manager Destroyed Device");
+    }
+
+    if (this->mInstance == nullptr) {
+        KP_LOG_ERROR(
+          "Kompute Manager destructor reached with null Instance pointer");
+        return;
+    }
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    if (this->mDebugReportCallback) {
+        this->mInstance->destroyDebugReportCallbackEXT(
+          this->mDebugReportCallback, nullptr, this->mDebugDispatcher);
+        this->mDebugReportCallback = nullptr; // a second destroy() must not free it again
+        KP_LOG_DEBUG("Kompute Manager Destroyed Debug Report Callback");
+    }
+#endif
+
+    if (this->mFreeInstance) {
+        this->mInstance->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mInstance = nullptr;
+        KP_LOG_DEBUG("Kompute Manager Destroyed Instance");
+    }
+}
+
+// Creates the manager-owned Vulkan instance. Unless debug layers are compiled
+// out, supported validation layers (defaults plus KOMPUTE_ENV_DEBUG_LAYERS) and
+// a debug report callback are enabled. Throws std::runtime_error on failure.
+void
+Manager::createInstance()
+{
+    KP_LOG_DEBUG("Kompute Manager creating instance");
+
+    this->mFreeInstance = true;
+
+    vk::ApplicationInfo applicationInfo;
+    applicationInfo.pApplicationName = "Kompute";
+    applicationInfo.pEngineName = "Kompute";
+    applicationInfo.apiVersion = KOMPUTE_VK_API_VERSION;
+    applicationInfo.engineVersion = KOMPUTE_VK_API_VERSION;
+    applicationInfo.applicationVersion = KOMPUTE_VK_API_VERSION;
+
+    std::vector<const char*> applicationExtensions;
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME);
+#endif
+
+    vk::InstanceCreateInfo computeInstanceCreateInfo;
+    computeInstanceCreateInfo.pApplicationInfo = &applicationInfo;
+    if (!applicationExtensions.empty()) {
+        computeInstanceCreateInfo.enabledExtensionCount =
+          (uint32_t)applicationExtensions.size();
+        computeInstanceCreateInfo.ppEnabledExtensionNames =
+          applicationExtensions.data();
+    }
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    KP_LOG_DEBUG("Kompute Manager adding debug validation layers");
+    // We'll identify the layers that are supported
+    std::vector<const char*> validLayerNames;
+    std::vector<const char*> desiredLayerNames = {
+        "VK_LAYER_LUNARG_assistant_layer",
+        "VK_LAYER_LUNARG_standard_validation",
+        "VK_LAYER_KHRONOS_validation",
+    };
+    // Owns the strings that desiredLayerNames points into; must outlive it.
+    std::vector<std::string> envLayerNames;
+    const char* envLayerNamesVal = std::getenv("KOMPUTE_ENV_DEBUG_LAYERS");
+    if (envLayerNamesVal != nullptr && *envLayerNamesVal != '\0') {
+        KP_LOG_DEBUG("Kompute Manager adding environment layers: {}",
+                     envLayerNamesVal);
+        std::istringstream iss(envLayerNamesVal);
+        envLayerNames.assign(std::istream_iterator<std::string>(iss),
+                             std::istream_iterator<std::string>());
+        for (const std::string& layerName : envLayerNames) {
+            desiredLayerNames.push_back(layerName.c_str());
+        }
+        KP_LOG_DEBUG("Desired layers: {}", fmt::join(desiredLayerNames, ", "));
+    }
+
+    // Identify the valid layer names based on the desiredLayerNames
+    {
+        std::set<std::string> uniqueLayerNames;
+        for (const vk::LayerProperties& layerProperties :
+             vk::enumerateInstanceLayerProperties()) {
+            uniqueLayerNames.emplace(layerProperties.layerName.data());
+        }
+        KP_LOG_DEBUG("Available layers: {}", fmt::join(uniqueLayerNames, ", "));
+        for (const char* desiredLayerName : desiredLayerNames) {
+            if (uniqueLayerNames.count(desiredLayerName) != 0) {
+                validLayerNames.push_back(desiredLayerName);
+            }
+        }
+    }
+
+    if (!validLayerNames.empty()) {
+        KP_LOG_DEBUG(
+          "Kompute Manager Initializing instance with valid layers: {}",
+          fmt::join(validLayerNames, ", "));
+        computeInstanceCreateInfo.enabledLayerCount =
+          static_cast<uint32_t>(validLayerNames.size());
+        computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data();
+    } else {
+        KP_LOG_WARN("Kompute Manager no valid layer names found from desired "
+                    "layer names");
+    }
+#endif
+
+#if VK_USE_PLATFORM_ANDROID_KHR
+    vk::DynamicLoader dl;
+    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
+      dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+
+    this->mInstance = std::make_shared<vk::Instance>();
+    const vk::Result result = vk::createInstance(
+      &computeInstanceCreateInfo, nullptr, this->mInstance.get());
+    if (result != vk::Result::eSuccess) {
+        this->mInstance = nullptr; // never keep a handle that was not created
+        throw std::runtime_error("Kompute Manager could not create instance");
+    }
+
+#if VK_USE_PLATFORM_ANDROID_KHR
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance);
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+
+    KP_LOG_DEBUG("Kompute Manager Instance Created");
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    KP_LOG_DEBUG("Kompute Manager adding debug callbacks");
+    if (validLayerNames.size() > 0) {
+        vk::DebugReportCallbackCreateInfoEXT debugCreateInfo = {};
+        debugCreateInfo.flags = vk::DebugReportFlagBitsEXT::eError |
+                                vk::DebugReportFlagBitsEXT::eWarning;
+        debugCreateInfo.pfnCallback =
+          (PFN_vkDebugReportCallbackEXT)debugMessageCallback;
+        this->mDebugDispatcher.init(*this->mInstance, &vkGetInstanceProcAddr);
+        this->mDebugReportCallback =
+          this->mInstance->createDebugReportCallbackEXT(
+            debugCreateInfo, nullptr, this->mDebugDispatcher);
+    }
+#endif
+}
+
+// Forgets tracked tensors, algorithms and sequences that have already expired;
+// live entries stay registered. No-op when resources are not managed.
+void
+Manager::clear()
+{
+    if (!this->mManageResources) {
+        return;
+    }
+
+    auto& tensors = this->mManagedTensors;
+    tensors.erase(std::remove_if(tensors.begin(), tensors.end(),
+                    [](std::weak_ptr<Tensor> t) { return t.expired(); }),
+                  tensors.end());
+    auto& algorithms = this->mManagedAlgorithms;
+    algorithms.erase(std::remove_if(algorithms.begin(), algorithms.end(),
+                       [](std::weak_ptr<Algorithm> t) { return t.expired(); }),
+                     algorithms.end());
+    auto& sequences = this->mManagedSequences;
+    sequences.erase(std::remove_if(sequences.begin(), sequences.end(),
+                      [](std::weak_ptr<Sequence> t) { return t.expired(); }),
+                    sequences.end());
+}
+
+// Selects the physical device at physicalDeviceIndex and creates the logical
+// device with one queue per entry of familyQueueIndices (or one queue on the
+// first compute-capable family when that list is empty), then caches the
+// queues. Desired extensions the device lacks are dropped with an error log.
+// Throws std::runtime_error on a missing instance/device/queue or on failure.
+void
+Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
+                      uint32_t physicalDeviceIndex,
+                      const std::vector<std::string>& desiredExtensions)
+{
+    KP_LOG_DEBUG("Kompute Manager creating Device");
+
+    if (this->mInstance == nullptr) {
+        throw std::runtime_error("Kompute Manager instance is null");
+    }
+
+    this->mFreeDevice = true;
+
+    // Getting an integer that says how many vulkan devices we have
+    std::vector<vk::PhysicalDevice> physicalDevices =
+      this->mInstance->enumeratePhysicalDevices();
+    uint32_t deviceCount = static_cast<uint32_t>(physicalDevices.size());
+
+    // This means there are no devices at all
+    if (deviceCount == 0) {
+        throw std::runtime_error("Failed to find GPUs with Vulkan support! "
+                                 "Maybe you haven't installed vulkan drivers?");
+    }
+
+    // Valid indices are 0 .. deviceCount - 1 (with 2 devices only 0 and 1),
+    // so any physicalDeviceIndex at or beyond deviceCount is rejected
+    if (!(deviceCount > physicalDeviceIndex)) {
+        throw std::runtime_error("There is no such physical index or device, "
+                                 "please use your existing device");
+    }
+
+    vk::PhysicalDevice physicalDevice = physicalDevices[physicalDeviceIndex];
+    this->mPhysicalDevice =
+      std::make_shared<vk::PhysicalDevice>(physicalDevice);
+
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
+    vk::PhysicalDeviceProperties physicalDeviceProperties =
+      physicalDevice.getProperties();
+#endif
+
+    KP_LOG_INFO("Using physical device index {} found {}",
+                physicalDeviceIndex,
+                physicalDeviceProperties.deviceName);
+
+    if (familyQueueIndices.empty()) {
+        // Find compute queue
+        std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
+          physicalDevice.getQueueFamilyProperties();
+
+        uint32_t computeQueueFamilyIndex = 0;
+        bool computeQueueSupported = false;
+        for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
+            if (allQueueFamilyProperties[i].queueFlags &
+                vk::QueueFlagBits::eCompute) {
+                computeQueueFamilyIndex = i;
+                computeQueueSupported = true;
+                break;
+            }
+        }
+
+        if (!computeQueueSupported) {
+            throw std::runtime_error("Compute queue is not supported");
+        }
+
+        this->mComputeQueueFamilyIndices.push_back(computeQueueFamilyIndex);
+    } else {
+        this->mComputeQueueFamilyIndices = familyQueueIndices;
+    }
+
+    std::unordered_map<uint32_t, uint32_t> familyQueueCounts;
+    std::unordered_map<uint32_t, std::vector<float>> familyQueuePriorities;
+    for (const auto& value : this->mComputeQueueFamilyIndices) {
+        familyQueueCounts[value]++;
+        familyQueuePriorities[value].push_back(1.0f);
+    }
+
+    std::unordered_map<uint32_t, uint32_t> familyQueueIndexCount;
+    std::vector<vk::DeviceQueueCreateInfo> deviceQueueCreateInfos;
+    for (const auto& familyQueueInfo : familyQueueCounts) {
+        // Next queue slot to hand out for this family starts at 0
+        familyQueueIndexCount[familyQueueInfo.first] = 0;
+
+        // Creating the respective device queue
+        vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
+          vk::DeviceQueueCreateFlags(),
+          familyQueueInfo.first,
+          familyQueueInfo.second,
+          familyQueuePriorities[familyQueueInfo.first].data());
+        deviceQueueCreateInfos.push_back(deviceQueueCreateInfo);
+    }
+
+    KP_LOG_DEBUG("Kompute Manager desired extension layers {}",
+                 fmt::join(desiredExtensions, ", "));
+
+    std::vector<vk::ExtensionProperties> deviceExtensions =
+      this->mPhysicalDevice->enumerateDeviceExtensionProperties();
+
+    std::set<std::string> uniqueExtensionNames;
+    for (const vk::ExtensionProperties& ext : deviceExtensions) {
+        uniqueExtensionNames.insert(ext.extensionName);
+    }
+    KP_LOG_DEBUG("Kompute Manager available extensions {}",
+                 fmt::join(uniqueExtensionNames, ", "));
+    std::vector<const char*> validExtensions;
+    for (const std::string& ext : desiredExtensions) {
+        if (uniqueExtensionNames.count(ext) != 0) {
+            validExtensions.push_back(ext.c_str());
+        }
+    }
+    if (desiredExtensions.size() != validExtensions.size()) {
+        KP_LOG_ERROR("Kompute Manager not all extensions were added: {}",
+                     fmt::join(validExtensions, ", "));
+    }
+
+    vk::PhysicalDeviceFeatures features;
+    features.shaderInt16 = true;
+
+    vk::PhysicalDeviceVulkan11Features features11;
+    features11.uniformAndStorageBuffer16BitAccess = true;
+    features11.storageBuffer16BitAccess = true;
+    features11.pNext = nullptr;
+
+    vk::PhysicalDeviceVulkan12Features features12;
+    features12.storageBuffer8BitAccess = true;
+    features12.uniformAndStorageBuffer8BitAccess = true;
+    features12.shaderFloat16 = true;
+    features12.shaderInt8 = true;
+    features12.pNext = &features11;
+
+    vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(),
+                                          deviceQueueCreateInfos.size(),
+                                          deviceQueueCreateInfos.data(),
+                                          {},
+                                          {},
+                                          validExtensions.size(),
+                                          validExtensions.data(),
+                                          &features);
+
+    deviceCreateInfo.pNext = &features12;
+
+    this->mDevice = std::make_shared<vk::Device>();
+    vk::Result r = physicalDevice.createDevice(
+      &deviceCreateInfo, nullptr, this->mDevice.get());
+    if (r != vk::Result::eSuccess) {
+        // Continuing would fetch queues from a device that does not exist.
+        this->mDevice = nullptr;
+        throw std::runtime_error("Kompute Manager could not create device");
+    }
+
+    KP_LOG_DEBUG("Kompute Manager device created");
+
+    for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndices) {
+        std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();
+
+        this->mDevice->getQueue(familyQueueIndex,
+                                familyQueueIndexCount[familyQueueIndex],
+                                currQueue.get());
+
+        familyQueueIndexCount[familyQueueIndex]++;
+
+        this->mComputeQueues.push_back(currQueue);
+    }
+
+    KP_LOG_DEBUG("Kompute Manager compute queue obtained");
+}
+
+// Creates a sequence on the compute queue at queueIndex and tracks it when the
+// manager owns its resources. Throws std::runtime_error when no such queue
+// exists (for example before a device was initialized) instead of reading
+// past the end of the queue vectors.
+std::shared_ptr<Sequence>
+Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps)
+{
+    KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex);
+    if (queueIndex >= this->mComputeQueues.size() ||
+        queueIndex >= this->mComputeQueueFamilyIndices.size()) {
+        throw std::runtime_error("Kompute Manager queue index out of range");
+    }
+    std::shared_ptr<Sequence> sq{ new kp::Sequence(
+      this->mPhysicalDevice, this->mDevice, this->mComputeQueues[queueIndex],
+      this->mComputeQueueFamilyIndices[queueIndex], totalTimestamps) };
+    if (this->mManageResources) { this->mManagedSequences.push_back(sq); }
+    return sq;
+}
+
+vk::PhysicalDeviceProperties
+Manager::getDeviceProperties() const
+{ // NOTE(review): mPhysicalDevice is dereferenced unchecked; call after device selection.
+    return this->mPhysicalDevice->getProperties();
+}
+
+std::vector<vk::PhysicalDevice>
+Manager::listDevices() const
+{ // Enumerates the physical devices of the instance; mInstance is not null-checked.
+    return this->mInstance->enumeratePhysicalDevices();
+}
+
+std::shared_ptr<vk::Instance>
+Manager::getVkInstance() const
+{ // Returns a copy of the shared pointer, i.e. shared ownership of the handle object.
+    return this->mInstance;
+}
+
+}
diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp
new file mode 100644
index 000000000..cad334f0c
--- /dev/null
+++ b/kompute/src/OpAlgoDispatch.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpAlgoDispatch.hpp"
+
+namespace kp {
+
+OpAlgoDispatch::~OpAlgoDispatch()
+{
+    KP_LOG_DEBUG("Kompute OpAlgoDispatch destructor started");
+    // Released with free(), so the buffer presumably came from malloc() -- TODO confirm.
+    if (this->mPushConstantsData) {
+        KP_LOG_DEBUG("Kompute freeing push constants data");
+        free(this->mPushConstantsData);
+    }
+}
+
+void
+OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpAlgoDispatch record called");
+    // Barrier on every tensor of the algorithm: transfer-stage writes to the
+    // primary buffer must complete before compute-shader reads begin
+    for (const std::shared_ptr<Tensor>& tensor :
+         this->mAlgorithm->getTensors()) {
+        tensor->recordPrimaryBufferMemoryBarrier(
+          commandBuffer,
+          vk::AccessFlagBits::eTransferWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eTransfer,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+    // Hand this operation's push constants to the algorithm when a size is set
+    if (this->mPushConstantsSize) {
+        this->mAlgorithm->setPushConstants(
+          this->mPushConstantsData,
+          this->mPushConstantsSize,
+          this->mPushConstantsDataTypeMemorySize);
+    }
+    // Record the bind, push-constant and dispatch commands, in that order
+    this->mAlgorithm->recordBindCore(commandBuffer);
+    this->mAlgorithm->recordBindPush(commandBuffer);
+    this->mAlgorithm->recordDispatch(commandBuffer);
+}
+
+void
+OpAlgoDispatch::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpAlgoDispatch preEval called");
+}
+
+void
+OpAlgoDispatch::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpAlgoDispatch postEval called");
+}
+
+}
diff --git a/kompute/src/OpBufferSyncDevice.cpp b/kompute/src/OpBufferSyncDevice.cpp
new file mode 100644
index 000000000..baaafda0f
--- /dev/null
+++ b/kompute/src/OpBufferSyncDevice.cpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpBufferSyncDevice.hpp"
+
+namespace kp {
+
+OpBufferSyncDevice::OpBufferSyncDevice(
+        vk::Buffer *primaryBuffer,
+        vk::Buffer *stagingBuffer,
+        vk::DeviceSize size)
+  : mPrimaryBuffer(primaryBuffer)
+  , mStagingBuffer(stagingBuffer)
+  , mSize(size)
+{ // Stores the raw buffer pointers and the size as given; nothing is copied here.
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice constructor with params");
+}
+
+OpBufferSyncDevice::~OpBufferSyncDevice()
+{ // The stored buffer pointers are not released here.
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice destructor started");
+}
+
+void
+OpBufferSyncDevice::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice record called");
+    vk::BufferCopy copyRegion(0, 0, mSize); // srcOffset, dstOffset, size
+    commandBuffer.copyBuffer(*mStagingBuffer, *mPrimaryBuffer, copyRegion); // staging -> primary
+}
+
+void
+OpBufferSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice preEval called");
+}
+
+void
+OpBufferSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpBufferSyncDevice postEval called");
+}
+
+}
diff --git a/kompute/src/OpBufferSyncLocal.cpp b/kompute/src/OpBufferSyncLocal.cpp
new file mode 100644
index 000000000..63739a351
--- /dev/null
+++ b/kompute/src/OpBufferSyncLocal.cpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpBufferSyncLocal.hpp"
+
+namespace kp {
+
+OpBufferSyncLocal::OpBufferSyncLocal(
+        vk::Buffer *primaryBuffer,
+        vk::Buffer *stagingBuffer,
+        vk::DeviceSize size)
+  : mPrimaryBuffer(primaryBuffer)
+  , mStagingBuffer(stagingBuffer)
+  , mSize(size)
+{ // Stores the raw buffer pointers and the size as given; nothing is copied here.
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal constructor with params");
+}
+
+OpBufferSyncLocal::~OpBufferSyncLocal()
+{ // The stored buffer pointers are not released here.
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal destructor started");
+}
+
+void
+OpBufferSyncLocal::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal record called");
+    vk::BufferCopy copyRegion(0, 0, mSize); // srcOffset, dstOffset, size
+    commandBuffer.copyBuffer(*mPrimaryBuffer, *mStagingBuffer, copyRegion); // primary -> staging
+}
+
+void
+OpBufferSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal preEval called");
+}
+
+void
+OpBufferSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpBufferSyncLocal postEval called");
+}
+
+}
diff --git a/kompute/src/OpMemoryBarrier.cpp b/kompute/src/OpMemoryBarrier.cpp
new file mode 100644
index 000000000..89d44d85e
--- /dev/null
+++ b/kompute/src/OpMemoryBarrier.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpMemoryBarrier.hpp"
+
+namespace kp {
+
+OpMemoryBarrier::OpMemoryBarrier(
+  const std::vector<std::shared_ptr<Tensor>>& tensors,
+  const vk::AccessFlagBits& srcAccessMask,
+  const vk::AccessFlagBits& dstAccessMask,
+  const vk::PipelineStageFlagBits& srcStageMask,
+  const vk::PipelineStageFlagBits& dstStageMask,
+  bool barrierOnPrimary)
+  : mSrcAccessMask(srcAccessMask)
+  , mDstAccessMask(dstAccessMask)
+  , mSrcStageMask(srcStageMask)
+  , mDstStageMask(dstStageMask)
+  , mBarrierOnPrimary(barrierOnPrimary)
+  , mTensors(tensors)
+{ // Captures the masks, the target-buffer choice and the tensor list for record().
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor");
+}
+
+OpMemoryBarrier::~OpMemoryBarrier()
+{ // Nothing is released explicitly; only a debug trace is emitted.
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started");
+}
+
+// Records one buffer memory barrier per tensor, using the access and stage
+// masks given at construction. mBarrierOnPrimary selects whether each tensor's
+// primary or staging buffer is the barrier's target.
+void
+OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier record called");
+
+    for (const std::shared_ptr<Tensor>& target : this->mTensors) {
+        if (this->mBarrierOnPrimary) {
+            target->recordPrimaryBufferMemoryBarrier(commandBuffer,
+                                                     this->mSrcAccessMask,
+                                                     this->mDstAccessMask,
+                                                     this->mSrcStageMask,
+                                                     this->mDstStageMask);
+        } else {
+            target->recordStagingBufferMemoryBarrier(commandBuffer,
+                                                     this->mSrcAccessMask,
+                                                     this->mDstAccessMask,
+                                                     this->mSrcStageMask,
+                                                     this->mDstStageMask);
+        }
+    }
+}
+
+void
+OpMemoryBarrier::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called");
+}
+
+void
+OpMemoryBarrier::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier postEval called");
+}
+
+}
diff --git a/kompute/src/OpTensorCopy.cpp b/kompute/src/OpTensorCopy.cpp
new file mode 100644
index 000000000..e732cc413
--- /dev/null
+++ b/kompute/src/OpTensorCopy.cpp
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpTensorCopy.hpp"
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+// Stores the tensors and validates them: at least two are required and each
+// must match the first one's dataType() and size(), else std::runtime_error.
+OpTensorCopy::OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    KP_LOG_DEBUG("Kompute OpTensorCopy constructor with params");
+    this->mTensors = tensors;
+    if (this->mTensors.size() < 2) {
+        throw std::runtime_error(
+          "Kompute OpTensorCopy called with less than 2 tensor");
+    }
+
+    const std::shared_ptr<Tensor>& source = this->mTensors[0];
+    const kp::Tensor::TensorDataTypes expectedType = source->dataType();
+    const uint32_t expectedSize = source->size();
+    for (const std::shared_ptr<Tensor>& candidate : tensors) {
+        if (candidate->dataType() != expectedType) {
+            throw std::runtime_error(fmt::format(
+              "Attempting to copy tensors of different types from {} to {}",
+              Tensor::toString(expectedType),
+              Tensor::toString(candidate->dataType())));
+        }
+        if (candidate->size() != expectedSize) {
+            throw std::runtime_error(fmt::format(
+              "Attempting to copy tensors of different sizes from {} to {}",
+              expectedSize, candidate->size()));
+        }
+    }
+}
+
+OpTensorCopy::~OpTensorCopy()
+{ // Nothing is released explicitly; only a debug trace is emitted.
+    KP_LOG_DEBUG("Kompute OpTensorCopy destructor started");
+}
+
+void
+OpTensorCopy::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorCopy record called");
+
+    // Tensor 0 is the source: record a copy from it into every later tensor
+    for (size_t i = 1; i < this->mTensors.size(); i++) {
+        this->mTensors[i]->recordCopyFrom(commandBuffer, this->mTensors[0]);
+    }
+}
+
+void
+OpTensorCopy::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpTensorCopy preEval called");
+}
+
+void
+OpTensorCopy::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorCopy postEval called");
+    // Host-side counterpart of record(): propagate the source's raw data.
+    // Do not copy on the CPU side if the source is a storage tensor
+    if (this->mTensors[0]->tensorType() == kp::Tensor::TensorTypes::eStorage)
+    {
+        KP_LOG_DEBUG("Kompute OpTensorCopy not copying tensor source given it's of eStorage type");
+        return;
+    }
+    void* data = this->mTensors[0]->rawData();
+
+    // Copy the first tensor's data into every other non-storage tensor
+    for (size_t i = 1; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == kp::Tensor::TensorTypes::eStorage) {
+            KP_LOG_DEBUG("Kompute OpTensorCopy not copying to tensor dest given it's of eStorage type");
+            continue;
+        }
+        this->mTensors[i]->setRawData(data);
+    }
+}
+
+}
diff --git a/kompute/src/OpTensorSyncDevice.cpp b/kompute/src/OpTensorSyncDevice.cpp
new file mode 100644
index 000000000..4cc6abf71
--- /dev/null
+++ b/kompute/src/OpTensorSyncDevice.cpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/operations/OpTensorSyncDevice.hpp"
+
+namespace kp {
+
+OpTensorSyncDevice::OpTensorSyncDevice(
+  const std::vector<std::shared_ptr<Tensor>>& tensors)
+  : mPrimaryBuffer(nullptr) // raw buffer pointers start out unset in this constructor
+  , mStagingBuffer(nullptr)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice constructor with params");
+    // An empty tensor list is rejected before the list is stored.
+    if (tensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpTensorSyncDevice called with less than 1 tensor");
+    }
+
+    this->mTensors = tensors;
+}
+
+OpTensorSyncDevice::~OpTensorSyncDevice()
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice destructor started");
+    // Drop this operation's references to the tensors.
+    this->mTensors.clear();
+}
+
+// Records a staging-to-device copy for each eDevice tensor; others are skipped.
+void
+OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice record called");
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
+            tensor->recordCopyFromStagingToDevice(commandBuffer);
+        }
+    }
+}
+
+void
+OpTensorSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice preEval called");
+}
+
+void
+OpTensorSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{ // Only emits a debug trace; the command buffer is unused.
+    KP_LOG_DEBUG("Kompute OpTensorSyncDevice postEval called");
+}
+
+}
diff --git a/kompute/src/OpTensorSyncLocal.cpp b/kompute/src/OpTensorSyncLocal.cpp
new file mode 100644
index 000000000..1aa091b73
--- /dev/null
+++ b/kompute/src/OpTensorSyncLocal.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpTensorSyncLocal.hpp"
+
+namespace kp {
+
+OpTensorSyncLocal::OpTensorSyncLocal(
+  const std::vector<std::shared_ptr<Tensor>>& tensors)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal constructor with params");
+
+    // An operation over zero tensors is rejected before anything is stored.
+    if (tensors.empty()) {
+        throw std::runtime_error(
+          "Kompute OpTensorSyncLocal called with less than 1 tensor");
+    }
+    this->mTensors = tensors; // shares ownership of every tensor passed in
+}
+
+OpTensorSyncLocal::~OpTensorSyncLocal()
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal destructor started");
+}
+
+void
+OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal record called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            // Shader writes to the primary buffer must land before the copy.
+            this->mTensors[i]->recordPrimaryBufferMemoryBarrier(
+              commandBuffer,
+              vk::AccessFlagBits::eShaderWrite,
+              vk::AccessFlagBits::eTransferRead,
+              vk::PipelineStageFlagBits::eComputeShader,
+              vk::PipelineStageFlagBits::eTransfer);
+
+            this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer);
+            // The copy wrote the staging buffer, so that is what the host reads.
+            this->mTensors[i]->recordStagingBufferMemoryBarrier(
+              commandBuffer,
+              vk::AccessFlagBits::eTransferWrite,
+              vk::AccessFlagBits::eHostRead,
+              vk::PipelineStageFlagBits::eTransfer,
+              vk::PipelineStageFlagBits::eHost);
+        }
+    }
+}
+
+void
+OpTensorSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal preEval called");
+}
+
+void
+OpTensorSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
+{
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal postEval called");
+
+    KP_LOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
+}
+
+}
diff --git a/kompute/src/Sequence.cpp b/kompute/src/Sequence.cpp
new file mode 100644
index 000000000..3b5fb5fb5
--- /dev/null
+++ b/kompute/src/Sequence.cpp
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Sequence.hpp"
+
+namespace kp {
+
+Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::Queue> computeQueue,
+                   uint32_t queueIndex,
+                   uint32_t totalTimestamps)
+{
+    KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue");
+    // Store the shared Vulkan handles; the sequence does not create them.
+    this->mPhysicalDevice = physicalDevice;
+    this->mDevice = device;
+    this->mComputeQueue = computeQueue;
+    this->mQueueIndex = queueIndex;
+    // Pool first: the command buffer is allocated from it.
+    this->createCommandPool();
+    this->createCommandBuffer();
+    if (totalTimestamps > 0)
+        this->createTimestampQueryPool(totalTimestamps +
+                                       1); // +1 for query 0, latched in begin()
+}
+
+Sequence::~Sequence()
+{
+    KP_LOG_DEBUG("Kompute Sequence Destructor started");
+
+    if (this->mDevice) {
+        this->destroy();
+    }
+}
+
+void
+Sequence::begin()
+{
+    KP_LOG_DEBUG("Kompute sequence called BEGIN");
+    // While already recording, a second begin() only logs and returns.
+    if (this->isRecording()) {
+        KP_LOG_DEBUG("Kompute Sequence begin called when already recording");
+        return;
+    }
+    // Starting a recording while a submission is still marked running throws.
+    if (this->isRunning()) {
+        throw std::runtime_error(
+          "Kompute Sequence begin called when sequence still running");
+    }
+
+    KP_LOG_INFO("Kompute Sequence command now started recording");
+    this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
+    this->mRecording = true;
+
+    // Latch timestamp query 0 before any operation is recorded.
+    if (this->timestampQueryPool)
+        this->mCommandBuffer->writeTimestamp(
+          vk::PipelineStageFlagBits::eAllCommands,
+          *this->timestampQueryPool,
+          0);
+}
+
+void
+Sequence::end()
+{
+    KP_LOG_DEBUG("Kompute Sequence calling END");
+    // Ending while a submission is still marked running is an error.
+    if (this->isRunning()) {
+        throw std::runtime_error(
+          "Kompute Sequence end called when sequence still running");
+    }
+
+    if (!this->isRecording()) {
+        KP_LOG_WARN("Kompute Sequence end called when not recording");
+        return;
+    }
+
+    KP_LOG_INFO("Kompute Sequence command recording END");
+    this->mCommandBuffer->end();
+    this->mRecording = false;
+}
+
+void
+Sequence::clear()
+{
+    KP_LOG_DEBUG("Kompute Sequence calling clear");
+    if (this->isRecording()) {
+        this->end();
+    }
+}
+
+std::shared_ptr<Sequence>
+Sequence::eval()
+{
+    KP_LOG_DEBUG("Kompute sequence EVAL BEGIN");
+
+    return this->evalAsync()->evalAwait();
+}
+
+std::shared_ptr<Sequence>
+Sequence::eval(std::shared_ptr<OpBase> op)
+{
+    this->clear();
+    return this->record(op)->eval();
+}
+
+std::shared_ptr<Sequence>
+Sequence::evalAsync()
+{
+    if (this->isRecording()) {
+        this->end();
+    }
+    // Only one submission may be outstanding per sequence at a time.
+    if (this->mIsRunning) {
+        throw std::runtime_error(
+          "Kompute Sequence evalAsync called when an eval async was "
+          "called without successful wait");
+    }
+
+    this->mIsRunning = true;
+
+    for (size_t i = 0; i < this->mOperations.size(); i++) {
+        this->mOperations[i]->preEval(*this->mCommandBuffer);
+    }
+    // Submit the single command buffer with no wait semaphores.
+    vk::SubmitInfo submitInfo(
+      0, nullptr, nullptr, 1, this->mCommandBuffer.get());
+    // A fresh fence per submission; evalAwait() waits on it and destroys it.
+    this->mFence = this->mDevice->createFence(vk::FenceCreateInfo());
+
+    KP_LOG_DEBUG(
+      "Kompute sequence submitting command buffer into compute queue");
+
+    this->mComputeQueue->submit(1, &submitInfo, this->mFence);
+
+    return shared_from_this();
+}
+
+std::shared_ptr<Sequence>
+Sequence::evalAsync(std::shared_ptr<OpBase> op)
+{
+    this->clear();
+    this->record(op);
+    this->evalAsync();
+    return shared_from_this();
+}
+
+std::shared_ptr<Sequence>
+Sequence::evalAwait(uint64_t waitFor)
+{
+    if (!this->mIsRunning) {
+        KP_LOG_WARN("Kompute Sequence evalAwait called without existing eval");
+        return shared_from_this();
+    }
+    // Wait on the fence from evalAsync(), passing waitFor as the timeout.
+    vk::Result result =
+      this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
+    this->mDevice->destroy(
+      this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+    // NOTE(review): fence is destroyed even on timeout - confirm this is safe.
+    this->mIsRunning = false;
+    // A timeout skips the postEval hooks but still leaves the sequence idle.
+    if (result == vk::Result::eTimeout) {
+        KP_LOG_WARN("Kompute Sequence evalAwait reached timeout of {}",
+                    waitFor);
+        return shared_from_this();
+    }
+
+    for (size_t i = 0; i < this->mOperations.size(); i++) {
+        this->mOperations[i]->postEval(*this->mCommandBuffer);
+    }
+
+    return shared_from_this();
+}
+
+bool
+Sequence::isRunning() const
+{
+    return this->mIsRunning;
+}
+
+bool
+Sequence::isRecording() const
+{
+    return this->mRecording;
+}
+
+bool
+Sequence::isInit() const
+{
+    return this->mDevice && this->mCommandPool && this->mCommandBuffer &&
+           this->mComputeQueue;
+}
+
+void
+Sequence::rerecord()
+{
+    this->end();
+    std::vector<std::shared_ptr<OpBase>> pending;
+    pending.swap(this->mOperations); // record() re-appends each op below
+    for (const auto& recordedOp : pending) {
+        this->record(recordedOp);
+    }
+}
+
+void
+Sequence::destroy()
+{
+    KP_LOG_DEBUG("Kompute Sequence destroy called");
+    // Owned Vulkan objects are released first; shared handles are dropped last.
+    if (!this->mDevice) {
+        KP_LOG_WARN("Kompute Sequence destroy called "
+                    "with null Device pointer");
+        return;
+    }
+
+    if (this->mFreeCommandBuffer) {
+        KP_LOG_INFO("Freeing CommandBuffer");
+        if (!this->mCommandBuffer) {
+            KP_LOG_WARN("Kompute Sequence destroy called with null "
+                        "CommandBuffer pointer");
+            return;
+        }
+        this->mDevice->freeCommandBuffers(
+          *this->mCommandPool, 1, this->mCommandBuffer.get());
+
+        this->mCommandBuffer = nullptr;
+        this->mFreeCommandBuffer = false;
+
+        KP_LOG_DEBUG("Kompute Sequence Freed CommandBuffer");
+    }
+
+    if (this->mFreeCommandPool) {
+        KP_LOG_INFO("Destroying CommandPool");
+        if (this->mCommandPool == nullptr) {
+            KP_LOG_WARN("Kompute Sequence destroy called with null "
+                        "CommandPool pointer");
+            return;
+        }
+        this->mDevice->destroy(
+          *this->mCommandPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+
+        this->mCommandPool = nullptr;
+        this->mFreeCommandPool = false;
+
+        KP_LOG_DEBUG("Kompute Sequence Destroyed CommandPool");
+    }
+
+    if (this->mOperations.size()) {
+        KP_LOG_INFO("Kompute Sequence clearing operations buffer");
+        this->mOperations.clear();
+    }
+
+    if (this->timestampQueryPool) {
+        KP_LOG_INFO("Destroying QueryPool");
+        this->mDevice->destroy(
+          *this->timestampQueryPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+
+        this->timestampQueryPool = nullptr;
+        KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool");
+    }
+    // Dropping mDevice makes later destroy() calls warn and return early.
+    if (this->mDevice) {
+        this->mDevice = nullptr;
+    }
+    if (this->mPhysicalDevice) {
+        this->mPhysicalDevice = nullptr;
+    }
+    if (this->mComputeQueue) {
+        this->mComputeQueue = nullptr;
+    }
+}
+
+std::shared_ptr<Sequence>
+Sequence::record(std::shared_ptr<OpBase> op)
+{
+    KP_LOG_DEBUG("Kompute Sequence record function started");
+    // begin() returns immediately if recording has already started.
+    this->begin();
+
+    KP_LOG_DEBUG(
+      "Kompute Sequence running record on OpBase derived class instance");
+
+    op->record(*this->mCommandBuffer);
+
+    this->mOperations.push_back(op);
+    // Query 0 is written in begin(); the Nth recorded op latches query N.
+    if (this->timestampQueryPool)
+        this->mCommandBuffer->writeTimestamp(
+          vk::PipelineStageFlagBits::eAllCommands,
+          *this->timestampQueryPool,
+          this->mOperations.size());
+
+    return shared_from_this();
+}
+
+void
+Sequence::createCommandPool()
+{
+    KP_LOG_DEBUG("Kompute Sequence creating command pool");
+
+    if (!this->mDevice) {
+        throw std::runtime_error("Kompute Sequence device is null");
+    }
+    // Mark the pool as owned so destroy() releases it.
+    this->mFreeCommandPool = true;
+    // NOTE(review): createCommandPool's return value is discarded - confirm.
+    vk::CommandPoolCreateInfo commandPoolInfo(vk::CommandPoolCreateFlags(),
+                                              this->mQueueIndex);
+    this->mCommandPool = std::make_shared<vk::CommandPool>();
+    this->mDevice->createCommandPool(
+      &commandPoolInfo, nullptr, this->mCommandPool.get());
+    KP_LOG_DEBUG("Kompute Sequence Command Pool Created");
+}
+
+void
+Sequence::createCommandBuffer()
+{
+    KP_LOG_DEBUG("Kompute Sequence creating command buffer");
+    if (!this->mDevice) {
+        throw std::runtime_error("Kompute Sequence device is null");
+    }
+    if (!this->mCommandPool) {
+        throw std::runtime_error("Kompute Sequence command pool is null");
+    }
+    // Mark the buffer as owned so destroy() frees it back to the pool.
+    this->mFreeCommandBuffer = true;
+    // Request exactly one primary-level command buffer from the pool.
+    vk::CommandBufferAllocateInfo commandBufferAllocateInfo(
+      *this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1);
+    // NOTE(review): allocateCommandBuffers' return value is discarded - confirm.
+    this->mCommandBuffer = std::make_shared<vk::CommandBuffer>();
+    this->mDevice->allocateCommandBuffers(&commandBufferAllocateInfo,
+                                          this->mCommandBuffer.get());
+    KP_LOG_DEBUG("Kompute Sequence Command Buffer Created");
+}
+
+void
+Sequence::createTimestampQueryPool(uint32_t totalTimestamps)
+{
+    KP_LOG_DEBUG("Kompute Sequence creating query pool");
+    if (!this->isInit()) {
+        throw std::runtime_error(
+          "createTimestampQueryPool() called on uninitialized Sequence");
+    }
+    if (!this->mPhysicalDevice) {
+        throw std::runtime_error("Kompute Sequence physical device is null");
+    }
+
+    // Refuse early when the device limits report no timestamp support.
+    const auto deviceProperties = this->mPhysicalDevice->getProperties();
+    if (!deviceProperties.limits.timestampComputeAndGraphics) {
+        throw std::runtime_error("Device does not support timestamps");
+    }
+
+    // One query slot per requested timestamp.
+    vk::QueryPoolCreateInfo poolCreateInfo;
+    poolCreateInfo.setQueryType(vk::QueryType::eTimestamp);
+    poolCreateInfo.setQueryCount(totalTimestamps);
+    this->timestampQueryPool = std::make_shared<vk::QueryPool>(
+      this->mDevice->createQueryPool(poolCreateInfo));
+
+    KP_LOG_DEBUG("Query pool for timestamps created");
+}
+
+std::vector<std::uint64_t>
+Sequence::getTimestamps()
+{
+    if (!this->timestampQueryPool)
+        throw std::runtime_error("Timestamp latching not enabled");
+    // One value for the begin() latch plus one per recorded operation.
+    const auto n = this->mOperations.size() + 1;
+    std::vector<std::uint64_t> timestamps(n, 0);
+    this->mDevice->getQueryPoolResults(
+      *this->timestampQueryPool,
+      0,
+      n,
+      timestamps.size() * sizeof(std::uint64_t),
+      timestamps.data(),
+      sizeof(uint64_t),
+      vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
+    // Results are requested as 64-bit values with the eWait flag set.
+    return timestamps;
+}
+
+}
diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp
new file mode 100644
index 000000000..9c343ff13
--- /dev/null
+++ b/kompute/src/Tensor.cpp
@@ -0,0 +1,451 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+std::string
+Tensor::toString(Tensor::TensorDataTypes dt)
+{
+    switch (dt) {
+        case TensorDataTypes::eBool:
+            return "eBool";
+        case TensorDataTypes::eInt:
+            return "eInt";
+        case TensorDataTypes::eUnsignedInt:
+            return "eUnsignedInt";
+        case TensorDataTypes::eFloat:
+            return "eFloat";
+        case TensorDataTypes::eDouble:
+            return "eDouble";
+        default:
+            return "unknown";
+    }
+}
+
+std::string
+Tensor::toString(Tensor::TensorTypes dt)
+{
+    switch (dt) {
+        case TensorTypes::eDevice:
+            return "eDevice";
+        case TensorTypes::eHost:
+            return "eHost";
+        case TensorTypes::eStorage:
+            return "eStorage";
+        default:
+            return "unknown";
+    }
+}
+
+Tensor::Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+               std::shared_ptr<vk::Device> device,
+               void* data,
+               uint32_t elementTotalCount,
+               uint32_t elementMemorySize,
+               const TensorDataTypes& dataType,
+               vk::DeviceMemory *primaryMemory,
+               vk::Buffer *primaryBuffer,
+               vk::DeviceMemory *stagingMemory,
+               vk::Buffer *stagingBuffer,
+               vk::DeviceSize offset,
+               const TensorTypes& tensorType)
+{
+    KP_LOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
+                 elementTotalCount,
+                 Tensor::toString(tensorType));
+
+    this->mPhysicalDevice = physicalDevice;
+    this->mDevice = device;
+    this->mDataType = dataType;
+    this->mTensorType = tensorType;
+
+    this->rebuild(data, elementTotalCount, elementMemorySize, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
+}
+
+Tensor::~Tensor()
+{
+    KP_LOG_DEBUG("Kompute Tensor destructor started. Type: {}",
+                 Tensor::toString(this->tensorType()));
+
+    if (this->mDevice) {
+        this->destroy();
+    }
+
+    KP_LOG_DEBUG("Kompute Tensor destructor success");
+}
+
+void
+Tensor::rebuild(void* /*data*/,
+                uint32_t elementTotalCount,
+                uint64_t memorySize,
+                vk::DeviceMemory *primaryMemory,
+                vk::Buffer *primaryBuffer,
+                vk::DeviceMemory *stagingMemory,
+                vk::Buffer *stagingBuffer,
+                vk::DeviceSize offset)
+{
+    KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", elementTotalCount);
+    if (this->mPrimaryBuffer || this->mPrimaryMemory) {
+        KP_LOG_DEBUG(
+          "Kompute Tensor destroying existing resources before rebuild");
+        auto device = this->mDevice; // destroy() nulls mDevice and the sizes
+        this->destroy();
+        this->mDevice = device; // setGPUResources() throws without a device
+    }
+
+    this->mSize = elementTotalCount;
+    this->mMemorySize = memorySize;
+    this->mOffset = offset;
+    this->setGPUResources(primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
+}
+
+Tensor::TensorTypes
+Tensor::tensorType()
+{
+    return this->mTensorType;
+}
+
+bool
+Tensor::isInit()
+{
+    return this->mDevice && this->mPrimaryBuffer && this->mPrimaryMemory &&
+           this->mRawData;
+}
+
+uint32_t
+Tensor::size()
+{
+    return this->mSize;
+}
+
+uint64_t
+Tensor::memorySize()
+{
+    return this->mMemorySize;
+}
+
+kp::Tensor::TensorDataTypes
+Tensor::dataType()
+{
+    return this->mDataType;
+}
+
+void*
+Tensor::rawData()
+{
+    return this->mRawData;
+}
+
+void
+Tensor::setRawData(const void* data)
+{
+    memcpy(this->mRawData, data, this->memorySize());
+}
+
+void
+Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer,
+                       std::shared_ptr<Tensor> copyFromTensor)
+{
+    // BufferCopy is (srcOffset, dstOffset, size): read at the source's offset.
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(copyFromTensor->mOffset, mOffset, bufferSize);
+
+    KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize);
+
+    this->recordCopyBuffer(commandBuffer,
+                           copyFromTensor->mPrimaryBuffer,
+                           this->mPrimaryBuffer,
+                           bufferSize,
+                           copyRegion);
+}
+
+void
+Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer)
+{
+    if (!this->mStagingBuffer)
+        return;
+    // Source and destination use the same offset (mOffset) and size.
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);
+
+    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
+    // Direction: staging buffer -> primary buffer.
+    this->recordCopyBuffer(commandBuffer,
+                           this->mStagingBuffer,
+                           this->mPrimaryBuffer,
+                           bufferSize,
+                           copyRegion);
+}
+
+void
+Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer)
+{
+    if (!this->mStagingBuffer)
+        return;
+    // Source and destination use the same offset (mOffset) and size.
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);
+
+    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
+    // Direction: primary buffer -> staging buffer.
+    this->recordCopyBuffer(commandBuffer,
+                           this->mPrimaryBuffer,
+                           this->mStagingBuffer,
+                           bufferSize,
+                           copyRegion);
+}
+
+void
+Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
+                         vk::Buffer *bufferFrom,
+                         vk::Buffer *bufferTo,
+                         vk::DeviceSize /*bufferSize*/,
+                         vk::BufferCopy copyRegion)
+{
+
+    commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
+}
+
+void
+Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                         vk::AccessFlagBits srcAccessMask,
+                                         vk::AccessFlagBits dstAccessMask,
+                                         vk::PipelineStageFlagBits srcStageMask,
+                                         vk::PipelineStageFlagBits dstStageMask)
+{
+    KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier");
+
+    this->recordBufferMemoryBarrier(commandBuffer,
+                                    *this->mPrimaryBuffer,
+                                    srcAccessMask,
+                                    dstAccessMask,
+                                    srcStageMask,
+                                    dstStageMask);
+}
+
+void
+Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                         vk::AccessFlagBits srcAccessMask,
+                                         vk::AccessFlagBits dstAccessMask,
+                                         vk::PipelineStageFlagBits srcStageMask,
+                                         vk::PipelineStageFlagBits dstStageMask)
+{
+    if (!this->mStagingBuffer)
+        return;
+
+    KP_LOG_DEBUG("Kompute Tensor recording STAGING buffer memory barrier");
+
+    this->recordBufferMemoryBarrier(commandBuffer,
+                                    *this->mStagingBuffer,
+                                    srcAccessMask,
+                                    dstAccessMask,
+                                    srcStageMask,
+                                    dstStageMask);
+}
+
+void
+Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                  const vk::Buffer& buffer,
+                                  vk::AccessFlagBits srcAccessMask,
+                                  vk::AccessFlagBits dstAccessMask,
+                                  vk::PipelineStageFlagBits srcStageMask,
+                                  vk::PipelineStageFlagBits dstStageMask)
+{
+    KP_LOG_DEBUG("Kompute Tensor recording buffer memory barrier");
+
+    // Cover this tensor's own byte range, which starts at mOffset, not at 0.
+    vk::BufferMemoryBarrier bufferMemoryBarrier;
+    bufferMemoryBarrier.buffer = buffer;
+    bufferMemoryBarrier.offset = this->mOffset;
+    bufferMemoryBarrier.size = this->memorySize();
+    bufferMemoryBarrier.srcAccessMask = srcAccessMask;
+    bufferMemoryBarrier.dstAccessMask = dstAccessMask;
+    bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+
+    commandBuffer.pipelineBarrier(srcStageMask,
+                                  dstStageMask,
+                                  vk::DependencyFlags(),
+                                  nullptr,
+                                  bufferMemoryBarrier,
+                                  nullptr);
+}
+
+vk::DescriptorBufferInfo
+Tensor::constructDescriptorBufferInfo()
+{
+    KP_LOG_DEBUG("Kompute Tensor construct descriptor buffer info size {}",
+                 this->memorySize());
+    vk::DeviceSize bufferSize = this->memorySize();
+    return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
+                                    mOffset, // offset
+                                    bufferSize);
+}
+
+vk::BufferUsageFlags
+Tensor::getPrimaryBufferUsageFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::BufferUsageFlagBits::eStorageBuffer |
+                   vk::BufferUsageFlagBits::eTransferSrc |
+                   vk::BufferUsageFlagBits::eTransferDst;
+            break;
+        case TensorTypes::eHost:
+            return vk::BufferUsageFlagBits::eStorageBuffer |
+                   vk::BufferUsageFlagBits::eTransferSrc |
+                   vk::BufferUsageFlagBits::eTransferDst;
+            break;
+        case TensorTypes::eStorage:
+            return vk::BufferUsageFlagBits::eStorageBuffer;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+vk::MemoryPropertyFlags
+Tensor::getPrimaryMemoryPropertyFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::MemoryPropertyFlagBits::eDeviceLocal;
+            break;
+        case TensorTypes::eHost:
+            return vk::MemoryPropertyFlagBits::eHostVisible |
+                   vk::MemoryPropertyFlagBits::eHostCoherent;
+            break;
+        case TensorTypes::eStorage:
+            return vk::MemoryPropertyFlagBits::eDeviceLocal;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+vk::BufferUsageFlags
+Tensor::getStagingBufferUsageFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::BufferUsageFlagBits::eTransferSrc |
+                   vk::BufferUsageFlagBits::eTransferDst;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+vk::MemoryPropertyFlags
+Tensor::getStagingMemoryPropertyFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::MemoryPropertyFlagBits::eHostVisible |
+                   vk::MemoryPropertyFlagBits::eHostCoherent;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+void
+Tensor::setGPUResources(vk::DeviceMemory *primaryMemory,
+                        vk::Buffer *primaryBuffer,
+                        vk::DeviceMemory *stagingMemory,
+                        vk::Buffer *stagingBuffer,
+                        vk::DeviceSize /*offset*/)
+{
+    KP_LOG_DEBUG("Kompute Tensor creating buffer");
+    // Nothing is allocated here: the caller-provided handles are only stored.
+    if (!this->mPhysicalDevice) {
+        throw std::runtime_error("Kompute Tensor phyisical device is null");
+    }
+    if (!this->mDevice) {
+        throw std::runtime_error("Kompute Tensor device is null");
+    }
+
+    KP_LOG_DEBUG("Kompute Tensor creating primary buffer and memory");
+
+    this->mPrimaryBuffer = primaryBuffer;
+    this->mPrimaryMemory = primaryMemory;
+    // Staging handles are stored only for eDevice tensors; otherwise ignored.
+    if (this->mTensorType == TensorTypes::eDevice) {
+        KP_LOG_DEBUG("Kompute Tensor creating staging buffer and memory");
+
+        this->mStagingBuffer = stagingBuffer;
+        this->mStagingMemory = stagingMemory;
+    }
+
+    KP_LOG_DEBUG("Kompute Tensor buffer & memory creation successful");
+}
+
+void
+Tensor::destroy()
+{
+    KP_LOG_DEBUG("Kompute Tensor started destroy()");
+
+    // Invalidate the tensor's bookkeeping first; this happens even when the
+    // device pointer is already gone.
+    this->mMemorySize = 0;
+    this->mSize = 0;
+    this->mRawData = nullptr;
+
+    if (this->mDevice == nullptr) {
+        KP_LOG_WARN(
+          "Kompute Tensor destructor reached with null Device pointer");
+        return;
+    }
+
+    // No buffer or memory is released here; only the device reference is
+    // dropped.
+    this->mDevice = nullptr;
+
+    KP_LOG_DEBUG("Kompute Tensor successful destroy()");
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<bool>::dataType()
+{
+    return Tensor::TensorDataTypes::eBool;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<int32_t>::dataType()
+{
+    return Tensor::TensorDataTypes::eInt;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<uint32_t>::dataType()
+{
+    return Tensor::TensorDataTypes::eUnsignedInt;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<float>::dataType()
+{
+    return Tensor::TensorDataTypes::eFloat;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<double>::dataType()
+{
+    return Tensor::TensorDataTypes::eDouble;
+}
+
+}
diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt
new file mode 100644
index 000000000..05e1ed5e1
--- /dev/null
+++ b/kompute/src/include/CMakeLists.txt
@@ -0,0 +1,46 @@
+cmake_minimum_required(VERSION 3.20)
+
+# ####################################################
+# Kompute
+# ####################################################
+target_include_directories(kompute PUBLIC $<INSTALL_INTERFACE:include>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+
+target_sources(kompute PRIVATE
+
+    # Header files (useful in IDEs)
+    kompute/Algorithm.hpp
+    kompute/Core.hpp
+    kompute/Kompute.hpp
+    kompute/Manager.hpp
+    kompute/Sequence.hpp
+    kompute/Tensor.hpp
+
+    kompute/operations/OpAlgoDispatch.hpp
+    kompute/operations/OpBase.hpp
+    kompute/operations/OpMemoryBarrier.hpp
+    kompute/operations/OpMult.hpp
+    kompute/operations/OpTensorCopy.hpp
+    kompute/operations/OpTensorSyncDevice.hpp
+    kompute/operations/OpTensorSyncLocal.hpp
+    kompute/operations/OpBufferSyncDevice.hpp
+    kompute/operations/OpBufferSyncLocal.hpp
+
+    kompute/logger/Logger.hpp
+)
+
+install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+# ####################################################
+# Logger
+# ####################################################
+target_include_directories(kp_logger PUBLIC $<INSTALL_INTERFACE:include>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+
+target_sources(kp_logger PRIVATE
+
+    # Header files (useful in IDEs)
+    kompute/logger/Logger.hpp
+)
+
+install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
\ No newline at end of file
diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp
new file mode 100644
index 000000000..90fe48fef
--- /dev/null
+++ b/kompute/src/include/kompute/Algorithm.hpp
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "fmt/format.h"
+#include "kompute/Tensor.hpp"
+#include "logger/Logger.hpp"
+
+namespace kp {
+
+/**
+    Abstraction for compute shaders that are run on top of tensors grouped via
+   ParameterGroups (which group descriptorsets)
+*/
+class Algorithm
+{
+  public:
+    /**
+     *  Main constructor for algorithm with configuration parameters to create
+     *  the underlying resources.
+     *
+     *  @param device The Vulkan device to use for creating resources
+     *  @param pool Pointer to the descriptor pool, stored as-is by the algorithm
+     *  @param tensors (optional) The tensors used for the descriptor resources
+     *  @param spirv (optional) The spirv code to use to create the algorithm
+     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch
+     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
+     *  @param specializationConstants (optional) The templatable param is to be
+     * used to initialize the specialization constants which cannot be changed
+     * once set.
+     *  @param pushConstants (optional) This templatable param is to be used
+     * when initializing the pipeline, which set the size of the push constants
+     * - these can be modified but all new values must have the same data type
+     * and length as otherwise it will result in errors.
+     */
+    template<typename S = float, typename P = float>
+    Algorithm(std::shared_ptr<vk::Device> device,
+              vk::DescriptorPool *pool,
+              const std::vector<std::shared_ptr<Tensor>>& tensors = {},
+              const std::vector<uint32_t>& spirv = {},
+              const Workgroup& workgroup = {},
+              const std::vector<S>& specializationConstants = {},
+              const std::vector<P>& pushConstants = {})
+    {
+        KP_LOG_DEBUG("Kompute Algorithm Constructor with device");
+
+        this->mDevice = device;
+        this->mDescriptorPool = pool;
+        // rebuild() only runs when both tensors and SPIR-V are supplied.
+        if (tensors.size() && spirv.size()) {
+            KP_LOG_INFO(
+              "Kompute Algorithm initialising with tensor size: {} and "
+              "spirv size: {}",
+              tensors.size(),
+              spirv.size());
+            this->rebuild(tensors,
+                          spirv,
+                          workgroup,
+                          specializationConstants,
+                          pushConstants);
+        } else {
+            KP_LOG_INFO(
+              "Kompute Algorithm constructor with empty tensors and or "
+              "spirv so not rebuilding vulkan components");
+        }
+    }
+
+    /**
+     *  Rebuild function to reconstruct algorithm with configuration parameters
+     * to create the underlying resources.
+     *
+     *  @param tensors The tensors to use to create the descriptor resources
+     *  @param spirv The spirv code to use to create the algorithm
+     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch
+     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
+     *  @param specializationConstants (optional) The std::vector<S> to use to
+     * initialize the specialization constants which cannot be changed once
+     * set; an empty vector leaves previously stored values untouched.
+     *  @param pushConstants (optional) The std::vector<P> to use when
+     * initializing the pipeline, which set the size of the push constants -
+     * these can be modified but all new values must have the same total byte
+     * size as this initial value.
+     */
+    template<typename S = float, typename P = float>
+    void rebuild(const std::vector<std::shared_ptr<Tensor>>& tensors,
+                 const std::vector<uint32_t>& spirv,
+                 const Workgroup& workgroup = {},
+                 const std::vector<S>& specializationConstants = {},
+                 const std::vector<P>& pushConstants = {})
+    {
+        KP_LOG_DEBUG("Kompute Algorithm rebuild started");
+
+        this->mTensors = tensors;
+        this->mSpirv = spirv;
+
+        if (specializationConstants.size()) {
+            if (this->mSpecializationConstantsData) {
+                free(this->mSpecializationConstantsData);
+            }
+            uint32_t memorySize =
+              sizeof(decltype(specializationConstants.back()));
+            uint32_t size = specializationConstants.size();
+            uint32_t totalSize = size * memorySize;
+            this->mSpecializationConstantsData = malloc(totalSize);
+            memcpy(this->mSpecializationConstantsData,
+                   specializationConstants.data(),
+                   totalSize);
+            this->mSpecializationConstantsDataTypeMemorySize = memorySize;
+            this->mSpecializationConstantsSize = size;
+        }
+
+        if (pushConstants.size()) {
+            if (this->mPushConstantsData) {
+                free(this->mPushConstantsData);
+            }
+            uint32_t memorySize = sizeof(decltype(pushConstants.back()));
+            uint32_t size = pushConstants.size();
+            uint32_t totalSize = size * memorySize;
+            this->mPushConstantsData = malloc(totalSize);
+            memcpy(this->mPushConstantsData, pushConstants.data(), totalSize);
+            this->mPushConstantsDataTypeMemorySize = memorySize;
+            this->mPushConstantsSize = size;
+        }
+
+        this->setWorkgroup(
+          workgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);
+
+        // Descriptor pool is created first so if available then destroy all
+        // before rebuild
+        if (this->isInit()) {
+            this->destroy();
+        }
+
+        this->createParameters();
+        this->createShaderModule();
+        this->createPipeline();
+    }
+
+    /**
+     * Destructor for Algorithm which is responsible for freeing and destroying
+     * respective pipelines and owned parameter groups.
+     */
+    ~Algorithm();
+
+    /**
+     * Records the dispatch function with the provided template parameters or
+     * alternatively using the size of the tensor by default.
+     *
+     * @param commandBuffer Command buffer to record the algorithm resources to
+     */
+    void recordDispatch(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * Records command that binds the "core" algorithm components which consist
+     * of binding the pipeline and binding the descriptorsets.
+     *
+     * @param commandBuffer Command buffer to record the algorithm resources to
+     */
+    void recordBindCore(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * Records command that binds the push constants to the command buffer
+     * provided
+     * - it is required that the pushConstants provided are of the same size as
+     * the ones provided during initialization.
+     *
+     * @param commandBuffer Command buffer to record the algorithm resources to
+     */
+    void recordBindPush(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * function that checks all the gpu resource components to verify if these
+     * have been created and returns true if all are valid.
+     *
+     * @returns returns true if the algorithm is currently initialized.
+     */
+    bool isInit();
+
+    /**
+     * Sets the work group to use in the recordDispatch
+     *
+     * @param workgroup The kp::Workgroup value to use to update the algorithm.
+     * An unset x value is documented to fall back to the first tensor's size.
+     * NOTE(review): x is array index 0, not index 1 - confirm in Algorithm.cpp.
+     * @param minSize Presumably that fallback; callers pass tensor 0's size or 1.
+     */
+    void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1);
+    /**
+     * Typed convenience overload: forwards the vector's data pointer, element
+     * count and sizeof(T) to the raw-pointer setPushConstants overload.
+     *
+     * @param pushConstants Values to store as the new push constants. Their
+     * total byte size must equal the size stored previously, otherwise the
+     * raw-pointer overload throws std::runtime_error.
+     */
+    template<typename T>
+    void setPushConstants(const std::vector<T>& pushConstants)
+    {
+        const uint32_t elementBytes = sizeof(T);
+        const uint32_t elementCount = pushConstants.size();
+        this->setPushConstants(pushConstants.data(), elementCount, elementBytes);
+    }
+    /** Stores the new descriptor pool pointer, re-applies the current workgroup, then updates the parameters. */
+    void updateDescriptors(vk::DescriptorPool *pool)
+    {
+        this->mDescriptorPool = pool;
+        this->setWorkgroup(
+          this->mWorkgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);
+
+        this->updateParameters(); // TODO: See if we can reduce this
+    }
+
+    /**
+     * Sets the push constants to the new value provided to use in the next
+     * bindPush() with the raw memory block location and memory size to be used.
+     *
+     * @param data The raw data pointer to copy from; the pointee is not modified.
+     * @param size The number of data elements provided in the data
+     * @param memorySize The memory size of each of the data elements in bytes.
+     * @throws std::runtime_error on a total byte size mismatch or failed malloc.
+     */
+    void setPushConstants(const void* data, uint32_t size, uint32_t memorySize)
+    {
+        uint32_t totalSize = memorySize * size;
+        uint32_t previousTotalSize =
+          this->mPushConstantsDataTypeMemorySize * this->mPushConstantsSize;
+        if (totalSize != previousTotalSize) {
+            throw std::runtime_error(fmt::format(
+              "Kompute Algorithm push "
+              "constant total memory size provided is {} but expected {} bytes",
+              totalSize,
+              previousTotalSize));
+        }
+        // Fill a fresh buffer before freeing the old one so a failed malloc changes nothing.
+        void* replacement = malloc(totalSize);
+        if (totalSize && !replacement) {
+            throw std::runtime_error("Kompute Algorithm push constant malloc failed");
+        }
+        if (totalSize) { memcpy(replacement, data, totalSize); } // skip zero-byte copy: pointers may be null
+        free(this->mPushConstantsData); // free(nullptr) is a no-op
+        this->mPushConstantsData = replacement;
+        this->mPushConstantsDataTypeMemorySize = memorySize;
+        this->mPushConstantsSize = size;
+    }
+
+    /**
+     * Gets the current workgroup from the algorithm.
+     *
+     * @returns Const reference to a kp::Workgroup; presumably the value most
+     * recently applied through setWorkgroup (the definition is not in this
+     * header - confirm).
+     */
+    const Workgroup& getWorkgroup();
+    /**
+     * Gets the specialization constants of the current algorithm.
+     *
+     * @returns A std::vector<T> copy of the stored specialization constants;
+     * T should match the element type they were stored with.
+     */
+    template<typename T>
+    const std::vector<T> getSpecializationConstants()
+    {
+        return { (T*)this->mSpecializationConstantsData,
+                 ((T*)this->mSpecializationConstantsData) +
+                   this->mSpecializationConstantsSize };
+    }
+    /**
+     * Gets the push constants of the current algorithm.
+     *
+     * @returns A std::vector<T> copy of the currently stored push constants
+     */
+    template<typename T>
+    const std::vector<T> getPushConstants()
+    {
+        return { (T*)this->mPushConstantsData,
+                 ((T*)this->mPushConstantsData) + this->mPushConstantsSize };
+    }
+    /**
+     * Gets the current tensors that are used in the algorithm.
+     *
+     * @returns The list of tensors used in the algorithm.
+     */
+    const std::vector<std::shared_ptr<Tensor>>& getTensors();
+    void setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    void destroy();
+
+  private:
+    // -------------- NEVER OWNED RESOURCES
+    std::shared_ptr<vk::Device> mDevice;
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+
+    // -------------- OPTIONALLY OWNED RESOURCES
+    std::shared_ptr<vk::DescriptorSetLayout> mDescriptorSetLayout;
+    bool mFreeDescriptorSetLayout = false;
+    vk::DescriptorPool *mDescriptorPool = nullptr;
+    std::shared_ptr<vk::DescriptorSet> mDescriptorSet;
+    bool mFreeDescriptorSet = false;
+    std::shared_ptr<vk::ShaderModule> mShaderModule;
+    bool mFreeShaderModule = false;
+    std::shared_ptr<vk::PipelineLayout> mPipelineLayout;
+    bool mFreePipelineLayout = false;
+    std::shared_ptr<vk::PipelineCache> mPipelineCache;
+    bool mFreePipelineCache = false;
+    std::shared_ptr<vk::Pipeline> mPipeline;
+    bool mFreePipeline = false;
+
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<uint32_t> mSpirv;
+    void* mSpecializationConstantsData = nullptr;
+    uint32_t mSpecializationConstantsDataTypeMemorySize = 0;
+    uint32_t mSpecializationConstantsSize = 0;
+    void* mPushConstantsData = nullptr;
+    uint32_t mPushConstantsDataTypeMemorySize = 0;
+    uint32_t mPushConstantsSize = 0;
+    Workgroup mWorkgroup;
+
+    // Create util functions
+    void createShaderModule();
+    void createPipeline();
+
+    // Parameters
+    void freeParameters();
+    void createParameters();
+    void updateParameters();
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/Core.hpp b/kompute/src/include/kompute/Core.hpp
new file mode 100644
index 000000000..99222cbde
--- /dev/null
+++ b/kompute/src/include/kompute/Core.hpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include <vulkan/vulkan.hpp>
+
+// Aliases for the core value types used throughout the kp API
+namespace kp {
+using Workgroup = std::array<uint32_t, 3>;
+using Constants = std::vector<float>;
+}
+
+// Must be after vulkan is included
+#ifndef KOMPUTE_VK_API_VERSION
+#ifndef KOMPUTE_VK_API_MAJOR_VERSION
+#define KOMPUTE_VK_API_MAJOR_VERSION 1
+#endif // KOMPUTE_VK_API_MAJOR_VERSION
+#ifndef KOMPUTE_VK_API_MINOR_VERSION
+#define KOMPUTE_VK_API_MINOR_VERSION 2
+#endif // KOMPUTE_VK_API_MINOR_VERSION
+#define KOMPUTE_VK_API_VERSION                                                 \
+    VK_MAKE_VERSION(                                                           \
+      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
+#endif // KOMPUTE_VK_API_VERSION
+
+#if defined(KOMPUTE_BUILD_PYTHON)
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+// from python/src/main.cpp
+extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
+#endif
diff --git a/kompute/src/include/kompute/Kompute.hpp b/kompute/src/include/kompute/Kompute.hpp
new file mode 100644
index 000000000..f59a63b50
--- /dev/null
+++ b/kompute/src/include/kompute/Kompute.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "Algorithm.hpp"
+#include "Core.hpp"
+#include "Manager.hpp"
+#include "Sequence.hpp"
+#include "Tensor.hpp"
+
+#include "operations/OpAlgoDispatch.hpp"
+#include "operations/OpBase.hpp"
+#include "operations/OpMemoryBarrier.hpp"
+#include "operations/OpMult.hpp"
+#include "operations/OpTensorCopy.hpp"
+#include "operations/OpTensorSyncDevice.hpp"
+#include "operations/OpTensorSyncLocal.hpp"
+#include "operations/OpBufferSyncDevice.hpp"
+#include "operations/OpBufferSyncLocal.hpp"
+
+// Will be build by CMake and placed inside the build directory
+#include "ShaderLogisticRegression.hpp"
+#include "ShaderOpMult.hpp"
diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp
new file mode 100644
index 000000000..8fda58f84
--- /dev/null
+++ b/kompute/src/include/kompute/Manager.hpp
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include <set>
+#include <unordered_map>
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Sequence.hpp"
+#include "logger/Logger.hpp"
+
+#define KP_DEFAULT_SESSION "DEFAULT"
+
+namespace kp {
+
+/**
+    Base orchestrator which creates and manages device and child components
+*/
+class Manager
+{
+  public:
+    /**
+        Base constructor.
+    */
+    Manager();
+
+    /**
+     * Manager destructor which would ensure all owned resources are destroyed
+     * unless explicitly stated that resources should not be destroyed or freed.
+     */
+    ~Manager();
+
+    bool hasDevice() const {
+        return this->mDevice != nullptr; // true once a logical device is held
+    }
+
+    /**
+     * Initialize a device.
+     *
+     * @param physicalDeviceIndex The index of the physical device to use
+     * @param familyQueueIndices (Optional) List of queue indices to add for
+     * explicit allocation
+     * @param desiredExtensions The desired extensions to load from
+     * physicalDevice
+     */
+    void initializeDevice(uint32_t physicalDeviceIndex,
+            const std::vector<uint32_t>& familyQueueIndices = {},
+            const std::vector<std::string>& desiredExtensions = {});
+
+    /**
+     * Create a managed sequence that will be destroyed by this manager
+     * if it hasn't been destroyed by its reference count going to zero.
+     *
+     * @param queueIndex The queue to use from the available queues
+     * @param totalTimestamps The maximum number of timestamps to allocate.
+     * If zero (default), disables latching of timestamps.
+     * @returns Shared pointer with initialised sequence
+     */
+    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0,
+                                       uint32_t totalTimestamps = 0);
+
+    /**
+     * Create a tensor; it is tracked by this manager only if mManageResources is set.
+     *
+     * @param data The data to initialize the tensor with
+     * @param primaryMemory,primaryBuffer,stagingMemory,stagingBuffer Forwarded unchanged to kp::TensorT
+     * @param tensorType The type of tensor to initialize
+     * @returns Shared pointer with initialised tensor
+     */
+    template<typename T>
+    std::shared_ptr<TensorT<T>> tensorT(
+      const std::vector<T>& data,
+       vk::DeviceMemory *primaryMemory,
+       vk::Buffer *primaryBuffer,
+       vk::DeviceMemory *stagingMemory,
+       vk::Buffer *stagingBuffer,
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        KP_LOG_DEBUG("Kompute Manager tensor creation triggered");
+
+        std::shared_ptr<TensorT<T>> tensor{ new kp::TensorT<T>(
+          this->mPhysicalDevice, this->mDevice, data, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, tensorType) };
+
+        if (this->mManageResources) {
+            this->mManagedTensors.push_back(tensor);
+        }
+
+        return tensor;
+    }
+    /** Creates a kp::Tensor from raw data and caller-supplied Vulkan memory/buffer pointers; tracked only if mManageResources is set. */
+    std::shared_ptr<Tensor> tensor(
+      void* data,
+      uint32_t elementTotalCount,
+      uint64_t memorySize,
+      const Tensor::TensorDataTypes& dataType,
+      vk::DeviceMemory *primaryMemory,
+      vk::Buffer *primaryBuffer,
+      vk::DeviceMemory *stagingMemory,
+      vk::Buffer *stagingBuffer,
+      vk::DeviceSize offset,
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        std::shared_ptr<Tensor> tensor{ new kp::Tensor(this->mPhysicalDevice,
+                                                       this->mDevice,
+                                                       data,
+                                                       elementTotalCount,
+                                                       memorySize,
+                                                       dataType,
+                                                       primaryMemory,
+                                                       primaryBuffer,
+                                                       stagingMemory,
+                                                       stagingBuffer,
+                                                       offset,
+                                                       tensorType) };
+
+        if (this->mManageResources) {
+            this->mManagedTensors.push_back(tensor);
+        }
+
+        return tensor;
+    }
+
+    /**
+     * Non-template overload that creates an algorithm using float vectors for
+     * both the specialization constants and the push constants.
+     *
+     * @param pool Descriptor pool pointer forwarded to the kp::Algorithm
+     * @param tensors (optional) The tensors to initialise the algorithm with
+     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
+     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
+     * defaults to (tensor[0].size(), 1, 1)
+     * @param specializationConstants (optional) float vector to use for
+     * specialization constants, and defaults to an empty constant
+     * @param pushConstants (optional) float vector to use for push constants,
+     * and defaults to an empty constant
+     * @returns Shared pointer with initialised algorithm
+     */
+    std::shared_ptr<Algorithm> algorithm(
+      vk::DescriptorPool *pool,
+      const std::vector<std::shared_ptr<Tensor>>& tensors = {},
+      const std::vector<uint32_t>& spirv = {},
+      const Workgroup& workgroup = {},
+      const std::vector<float>& specializationConstants = {},
+      const std::vector<float>& pushConstants = {})
+    {
+        return this->algorithm<>(
+          pool, tensors, spirv, workgroup, specializationConstants, pushConstants);
+    }
+
+    /**
+     * Create an algorithm that this manager tracks (when mManageResources is
+     * set) so it can be destroyed if its reference count has not reached zero.
+     *
+     * @param pool Descriptor pool pointer forwarded to the kp::Algorithm
+     * @param tensors The tensors to initialise the algorithm with
+     * @param spirv The SPIRV bytes for the algorithm to dispatch
+     * @param workgroup kp::Workgroup for the algorithm to use
+     * @param specializationConstants templatable vector parameter to use for
+     * specialization constants; no default in this overload
+     * @param pushConstants templatable vector parameter to use for push
+     * constants; no default in this overload
+     * @returns Shared pointer with initialised algorithm
+     */
+    template<typename S = float, typename P = float>
+    std::shared_ptr<Algorithm> algorithm(
+      vk::DescriptorPool *pool,
+      const std::vector<std::shared_ptr<Tensor>>& tensors,
+      const std::vector<uint32_t>& spirv,
+      const Workgroup& workgroup,
+      const std::vector<S>& specializationConstants,
+      const std::vector<P>& pushConstants)
+    {
+
+        KP_LOG_DEBUG("Kompute Manager algorithm creation triggered");
+
+        std::shared_ptr<Algorithm> algorithm{ new kp::Algorithm(
+          this->mDevice,
+          pool,
+          tensors,
+          spirv,
+          workgroup,
+          specializationConstants,
+          pushConstants) };
+
+        if (this->mManageResources) {
+            this->mManagedAlgorithms.push_back(algorithm);
+        }
+
+        return algorithm;
+    }
+
+    /**
+     * Destroy the GPU resources and all managed resources by manager.
+     **/
+    void destroy();
+    /**
+     * Run a pseudo-garbage collection to release all the managed resources
+     * that have been already freed due to these reaching to zero ref count.
+     **/
+    void clear();
+
+    /**
+     * Information about the current device.
+     *
+     * @return vk::PhysicalDeviceProperties containing information about the
+     *device
+     **/
+    vk::PhysicalDeviceProperties getDeviceProperties() const;
+
+    /**
+     * List the devices available in the current vulkan instance.
+     *
+     * @return vector of physical devices containing their respective properties
+     **/
+    std::vector<vk::PhysicalDevice> listDevices() const;
+
+    /**
+     * The current Vulkan instance.
+     *
+     * @return a shared pointer to the current Vulkan instance held by this
+     *object
+     **/
+    std::shared_ptr<vk::Instance> getVkInstance() const;
+
+    std::shared_ptr<vk::Device> device() const { return mDevice; }
+    std::shared_ptr<vk::PhysicalDevice> physicalDevice() const { return mPhysicalDevice; }
+
+  private:
+    // -------------- OPTIONALLY OWNED RESOURCES
+    std::shared_ptr<vk::Instance> mInstance = nullptr;
+    bool mFreeInstance = false;
+    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
+    std::shared_ptr<vk::Device> mDevice = nullptr;
+    bool mFreeDevice = false;
+
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::weak_ptr<Tensor>> mManagedTensors;
+    std::vector<std::weak_ptr<Sequence>> mManagedSequences;
+    std::vector<std::weak_ptr<Algorithm>> mManagedAlgorithms;
+
+    std::vector<uint32_t> mComputeQueueFamilyIndices;
+    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
+
+    bool mManageResources = false;
+
+#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
+    vk::DebugReportCallbackEXT mDebugReportCallback;
+    vk::DispatchLoaderDynamic mDebugDispatcher;
+#endif
+
+    // Create functions
+    void createInstance();
+    void createDevice(const std::vector<uint32_t>& familyQueueIndices = {},
+                      uint32_t physicalDeviceIndex = 0,
+                      const std::vector<std::string>& desiredExtensions = {});
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/Sequence.hpp b/kompute/src/include/kompute/Sequence.hpp
new file mode 100644
index 000000000..e282242f1
--- /dev/null
+++ b/kompute/src/include/kompute/Sequence.hpp
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/operations/OpAlgoDispatch.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ *  Container of operations that can be sent to GPU as batch
+ */
+class Sequence : public std::enable_shared_from_this<Sequence>
+{
+  public:
+    /**
+     * Main constructor for sequence which requires core vulkan components to
+     * generate all dependent resources.
+     *
+     * @param physicalDevice Vulkan physical device
+     * @param device Vulkan logical device
+     * @param computeQueue Vulkan compute queue
+     * @param queueIndex Vulkan compute queue index in device
+     * @param totalTimestamps Maximum number of timestamps to allocate
+     */
+    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+             std::shared_ptr<vk::Device> device,
+             std::shared_ptr<vk::Queue> computeQueue,
+             uint32_t queueIndex,
+             uint32_t totalTimestamps = 0);
+    /**
+     * Destructor for sequence which is responsible for cleaning all subsequent
+     * owned operations.
+     */
+    ~Sequence();
+
+    /**
+     * Record function for operation to be added to the GPU queue in batch. This
+     * function requires the operation to be derived from the OpBase class. This
+     * function also requires the Sequence to be recording, otherwise it will
+     * not be able to add the operation.
+     *
+     * @param op Object derived from kp::OpBase that will be recorded by the
+     * sequence which will be used when the operation is evaluated.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
+
+    /**
+     * Record function for operation to be added to the GPU queue in batch. This
+     * template builds an operation of type T (which must derive from OpBase)
+     * from the tensors and params provided and forwards it to record(op), so
+     * the Sequence must be recording for the operation to be added.
+     *
+     * @param tensors Vector of tensors to use for the operation
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> record(
+      std::vector<std::shared_ptr<Tensor>> tensors,
+      TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
+        return this->record(op);
+    }
+    /**
+     * Record function for operation to be added to the GPU queue in batch. This
+     * template builds an operation of type T (which must derive from OpBase)
+     * from the algorithm and params provided and forwards it to record(op), so
+     * the Sequence must be recording for the operation to be added.
+     *
+     * @param algorithm Algorithm to use for the record often used for OpAlgo
+     * operations
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm,
+                                     TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(algorithm,
+                                     std::forward<TArgs>(params)...) };
+        return this->record(op);
+    }
+
+    /**
+     * Eval sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job synchronously (with a barrier).
+     *
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> eval();
+
+    /**
+     * Resets all the recorded and stored operations, records the operation
+     * provided and submits into the gpu as a submit job synchronously (with a
+     * barrier).
+     *
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);
+
+    /**
+     * Eval builds an operation of type T from the tensors and params provided
+     * and forwards it to eval(op) (synchronous submit with a barrier).
+     *
+     * @param tensors Vector of tensors to use for the operation
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors,
+                                   TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
+        return this->eval(op);
+    }
+
+    template<typename T, typename... TArgs> // builds T from the raw buffers, size and params, then forwards to eval(op)
+    std::shared_ptr<Sequence> eval(vk::Buffer *primaryBuffer,
+                                   vk::Buffer *stagingBuffer,
+                                   vk::DeviceSize size,
+                                   TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(primaryBuffer, stagingBuffer, size, std::forward<TArgs>(params)...) };
+        return this->eval(op);
+    }
+
+    /**
+     * Eval builds an operation of type T from the algorithm and params provided
+     * and forwards it to eval(op) (synchronous submit with a barrier).
+     *
+     * @param algorithm Algorithm to use for the record often used for OpAlgo
+     * operations
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm,
+                                   TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(algorithm,
+                                     std::forward<TArgs>(params)...) };
+        return this->eval(op);
+    }
+
+    /**
+     * Eval Async sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job without a barrier. evalAwait()
+     * must ALWAYS be called after to ensure the sequence is terminated
+     * correctly.
+     *
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> evalAsync();
+    /**
+     * Clears current operations to record provided one in the vector of
+     * operations into the gpu as a submit job without a barrier. evalAwait()
+     * must ALWAYS be called after to ensure the sequence is terminated
+     * correctly.
+     *
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);
+    /**
+     * Eval Async builds an operation of type T from the tensors and params
+     * provided and forwards it to evalAsync(op) (submit without a barrier).
+     *
+     * @param tensors Vector of tensors to use for the operation
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> evalAsync(
+      std::vector<std::shared_ptr<Tensor>> tensors,
+      TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
+        return this->evalAsync(op);
+    }
+    /**
+     * Eval Async builds an operation of type T from the algorithm and params
+     * provided and forwards it to evalAsync(op) (submit without a barrier).
+     *
+     * @param algorithm Algorithm to use for the record often used for OpAlgo
+     * operations
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm,
+                                        TArgs&&... params)
+    {
+        std::shared_ptr<T> op{ new T(algorithm,
+                                     std::forward<TArgs>(params)...) };
+        return this->evalAsync(op);
+    }
+
+    /**
+     * Eval Await waits for the fence to finish processing and then once it
+     * finishes, it runs the postEval of all operations.
+     * @param waitFor Number of milliseconds to wait before timing out
+     * (TODO confirm unit: Vulkan fence waits take nanoseconds).
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
+
+    /**
+     * Clear function clears all operations currently recorded and starts
+     * recording again.
+     */
+    void clear();
+
+    /**
+     * Return the timestamps that were latched at the beginning and
+     * after each operation during the last eval() call.
+     */
+    std::vector<std::uint64_t> getTimestamps();
+
+    /**
+     * Begins recording commands for commands to be submitted into the command
+     * buffer.
+     */
+    void begin();
+
+    /**
+     * Ends the recording and stops recording commands when the record command
+     * is sent.
+     */
+    void end();
+
+    /**
+     * Returns true if the sequence is currently recording.
+     *
+     * @return Boolean stating if recording ongoing.
+     */
+    bool isRecording() const;
+
+    /**
+     * Returns true if the sequence has been initialised, and it's based on the
+     * GPU resources being referenced.
+     *
+     * @return Boolean stating if is initialized
+     */
+    bool isInit() const;
+
+    /**
+     * Clears command buffer and triggers re-record of all the current
+     * operations saved, which is useful if the underlying kp::Tensors or
+     * kp::Algorithms are modified and need to be re-recorded.
+     */
+    void rerecord();
+
+    /**
+     * Returns true if the sequence is currently running - mostly used for async
+     * workloads.
+     *
+     * @return Boolean stating if currently running.
+     */
+    bool isRunning() const;
+
+    /**
+     * Destroys and frees the GPU resources which include the buffer and memory
+     * and sets the sequence as init=False.
+     */
+    void destroy();
+
+  private:
+    // -------------- NEVER OWNED RESOURCES
+    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
+    std::shared_ptr<vk::Device> mDevice = nullptr;
+    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
+    uint32_t mQueueIndex = -1; // -1 converts to UINT32_MAX for uint32_t
+
+    // -------------- OPTIONALLY OWNED RESOURCES
+    std::shared_ptr<vk::CommandPool> mCommandPool = nullptr;
+    bool mFreeCommandPool = false;
+    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
+    bool mFreeCommandBuffer = false;
+
+    // -------------- ALWAYS OWNED RESOURCES
+    vk::Fence mFence;
+    std::vector<std::shared_ptr<OpBase>> mOperations{};
+    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;
+
+    // State
+    bool mRecording = false;
+    bool mIsRunning = false;
+
+    // Create functions
+    void createCommandPool();
+    void createCommandBuffer();
+    void createTimestampQueryPool(uint32_t totalTimestamps);
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp
new file mode 100644
index 000000000..4c260ce6b
--- /dev/null
+++ b/kompute/src/include/kompute/Tensor.hpp
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+#pragma once
+
+#include "kompute/Core.hpp"
+#include "logger/Logger.hpp"
+#include <memory>
+#include <string>
+
+namespace kp {
+
+/**
+ * Structured data used in GPU operations.
+ *
+ * Tensors are the base building block in Kompute to perform operations across
+ * GPUs. Each tensor would have a respective Vulkan memory and buffer, which
+ * would be used to store their respective data. The tensors can be used for GPU
+ * data storage or transfer.
+ */
+class Tensor
+{
+  public:
+    /**
+     * Type for tensors created: Device allows memory to be transferred from
+     * staging buffers. Staging are host memory visible. Storage are device
+     * visible but are not set up to transfer or receive data (only for shader
+     * storage).
+     */
+    enum class TensorTypes
+    {
+        eDevice = 0,  ///< Type is device memory, source and destination
+        eHost = 1,    ///< Type is host memory, source and destination
+        eStorage = 2, ///< Type is Device memory (only)
+    };
+    enum class TensorDataTypes
+    {
+        eBool = 0,
+        eInt = 1,
+        eUnsignedInt = 2,
+        eFloat = 3,
+        eDouble = 4,
+    };
+
+    static std::string toString(TensorDataTypes dt);
+    static std::string toString(TensorTypes dt);
+
+    /**
+     *  Constructor with data, element count, memory size and the vulkan
+     * memory/buffer handles (plus offset) that the tensor will use.
+     *
+     *  @param physicalDevice The physical device to use to fetch properties
+     *  @param device The device to use to create the buffer and memory from
+     *  @param data Pointer to the data that will be used by the tensor
+     *  @param dataType Data type of the tensor, of type TensorDataTypes
+     *  @param tensorType Type for the tensor which is of type TensorTypes
+     */
+    Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+           std::shared_ptr<vk::Device> device,
+           void* data,
+           uint32_t elementTotalCount,
+           uint32_t memorySize, // NOTE(review): 32-bit here but uint64_t in rebuild() and mMemorySize
+           const TensorDataTypes& dataType,
+           vk::DeviceMemory *primaryMemory,
+           vk::Buffer *primaryBuffer,
+           vk::DeviceMemory *stagingMemory,
+           vk::Buffer *stagingBuffer,
+           vk::DeviceSize offset,
+           const TensorTypes& tensorType = TensorTypes::eDevice);
+
+    /**
+     * Destructor which is in charge of freeing vulkan resources unless they
+     * have been provided externally.
+     */
+    virtual ~Tensor();
+
+    /**
+     * Function to trigger reinitialisation of the tensor with new data, sizes
+     * and vulkan memory/buffer handles (plus offset).
+     *
+     * @param data Pointer to the data to use to initialise the tensor from
+     * @param elementTotalCount Number of elements in the new data
+     */
+    void rebuild(void* data,
+                 uint32_t elementTotalCount,
+                 uint64_t memorySize,
+                 vk::DeviceMemory *primaryMemory,
+                 vk::Buffer *primaryBuffer,
+                 vk::DeviceMemory *stagingMemory,
+                 vk::Buffer *stagingBuffer,
+                 vk::DeviceSize offset);
+
+    /**
+     * Destroys and frees the GPU resources which include the buffer and memory.
+     */
+    void destroy();
+
+    /**
+     * Check whether tensor is initialized based on the created gpu resources.
+     *
+     * @returns Boolean stating whether tensor is initialized
+     */
+    bool isInit();
+
+    /**
+     * Retrieve the tensor type of the Tensor
+     *
+     * @return Tensor type of tensor
+     */
+    TensorTypes tensorType();
+
+    /**
+     * Records a copy from the memory of the tensor provided to the current
+     * tensor. This is intended to pass memory into a processing, to perform
+     * a staging buffer transfer, or to gather output (among others).
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param copyFromTensor Tensor to copy the data from
+     */
+    void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
+                        std::shared_ptr<Tensor> copyFromTensor);
+
+    /**
+     * Records a copy from the internal staging memory to the device memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     */
+    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * Records a copy from the internal device memory to the staging memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     */
+    void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer);
+
+    /**
+     * Records the buffer memory barrier into the primary buffer and command
+     * buffer which ensures that relevant data transfers are carried out
+     * correctly.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param srcAccessMask Access flags for source access mask
+     * @param dstAccessMask Access flags for destination access mask
+     * @param srcStageMask Pipeline stage flags for source stage mask
+     * @param dstStageMask Pipeline stage flags for destination stage mask
+     */
+    void recordPrimaryBufferMemoryBarrier(
+      const vk::CommandBuffer& commandBuffer,
+      vk::AccessFlagBits srcAccessMask,
+      vk::AccessFlagBits dstAccessMask,
+      vk::PipelineStageFlagBits srcStageMask,
+      vk::PipelineStageFlagBits dstStageMask);
+    /**
+     * Records the buffer memory barrier into the staging buffer and command
+     * buffer which ensures that relevant data transfers are carried out
+     * correctly.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param srcAccessMask Access flags for source access mask
+     * @param dstAccessMask Access flags for destination access mask
+     * @param srcStageMask Pipeline stage flags for source stage mask
+     * @param dstStageMask Pipeline stage flags for destination stage mask
+     */
+    void recordStagingBufferMemoryBarrier(
+      const vk::CommandBuffer& commandBuffer,
+      vk::AccessFlagBits srcAccessMask,
+      vk::AccessFlagBits dstAccessMask,
+      vk::PipelineStageFlagBits srcStageMask,
+      vk::PipelineStageFlagBits dstStageMask);
+
+    /**
+     * Constructs a vulkan descriptor buffer info which can be used to specify
+     * and reference the underlying buffer component of the tensor without
+     * exposing it.
+     *
+     * @return Descriptor buffer info with own buffer
+     */
+    vk::DescriptorBufferInfo constructDescriptorBufferInfo();
+
+    /**
+     * Returns the size/magnitude of the Tensor, which will be the total number
+     * of elements across all dimensions
+     *
+     * @return Unsigned integer representing the total number of elements
+     */
+    uint32_t size();
+
+    /**
+     * Returns the total memory size of the data contained by the Tensor object
+     *
+     * @return Unsigned integer representing the memory of the tensor in bytes.
+     */
+    uint64_t memorySize();
+
+    /**
+     * Retrieve the data type of the tensor (one of eBool ... eDouble)
+     *
+     * @return Data type of tensor of type kp::Tensor::TensorDataTypes
+     */
+    TensorDataTypes dataType();
+
+    /**
+     * Retrieve the raw data via the pointer to the memory that contains the raw
+     * memory of this current tensor. This pointer gets changed to a nullptr when
+     * the Tensor is removed.
+     *
+     * @return Pointer to raw memory containing raw bytes data of Tensor.
+     */
+    void* rawData();
+
+    /**
+     * Sets / resets the data of the tensor which is directly done on the GPU
+     * host visible memory available by the tensor.
+     */
+    void setRawData(const void* data);
+
+    /**
+     * Template to return the pointer data converted by specific type, which
+     * would be any of the supported types including float, double, int32,
+     * uint32 and bool.
+     *
+     * @return The raw data pointer of the Tensor cast to T* (no copy is made).
+     */
+    template<typename T>
+    T* data()
+    {
+        return (T*)this->mRawData;
+    }
+
+    /**
+     * Template to get the data of the current tensor as a vector of specific
+     * type, which would be any of the supported types including float, double,
+     * int32, uint32 and bool.
+     *
+     * @return Vector of type provided by template, holding size() elements.
+     */
+    template<typename T>
+    std::vector<T> vector()
+    {
+        return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
+    }
+
+  protected:
+    // -------------- ALWAYS OWNED RESOURCES
+    TensorTypes mTensorType;
+    TensorDataTypes mDataType;
+    uint32_t mSize = 0;
+    uint64_t mMemorySize = 0;
+    vk::DeviceSize mOffset = 0;
+    void* mRawData = nullptr;
+
+  private:
+    // -------------- NEVER OWNED RESOURCES
+    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
+    std::shared_ptr<vk::Device> mDevice;
+    vk::Buffer *mPrimaryBuffer = nullptr;
+    vk::Buffer *mStagingBuffer = nullptr;
+    vk::DeviceMemory *mPrimaryMemory = nullptr;
+    vk::DeviceMemory *mStagingMemory = nullptr;
+
+    void setGPUResources(vk::DeviceMemory *primaryMemory,
+                         vk::Buffer *primaryBuffer,
+                         vk::DeviceMemory *stagingMemory,
+                         vk::Buffer *stagingBuffer,
+                         vk::DeviceSize offset);
+    void recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
+                          vk::Buffer *bufferFrom,
+                          vk::Buffer *bufferTo,
+                          vk::DeviceSize bufferSize,
+                          vk::BufferCopy copyRegion);
+    void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                   const vk::Buffer& buffer,
+                                   vk::AccessFlagBits srcAccessMask,
+                                   vk::AccessFlagBits dstAccessMask,
+                                   vk::PipelineStageFlagBits srcStageMask,
+                                   vk::PipelineStageFlagBits dstStageMask);
+
+    // Private util functions
+    vk::BufferUsageFlags getPrimaryBufferUsageFlags();
+    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
+    vk::BufferUsageFlags getStagingBufferUsageFlags();
+    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
+};
+
+template<typename T>
+class TensorT : public Tensor // NOTE(review): declares no constructor although Tensor has no default one
+{
+
+  public:
+    ~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); }
+
+    TensorDataTypes dataType(); // hides the non-virtual Tensor::dataType(); definition not visible here
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/logger/Logger.hpp b/kompute/src/include/kompute/logger/Logger.hpp
new file mode 100644
index 000000000..f97e95cf0
--- /dev/null
+++ b/kompute/src/include/kompute/logger/Logger.hpp
@@ -0,0 +1,197 @@
+#pragma once
+
+#define KOMPUTE_LOG_LEVEL_TRACE 0
+#define KOMPUTE_LOG_LEVEL_DEBUG 1
+#define KOMPUTE_LOG_LEVEL_INFO 2
+#define KOMPUTE_LOG_LEVEL_WARN 3
+#define KOMPUTE_LOG_LEVEL_ERROR 4
+#define KOMPUTE_LOG_LEVEL_CRITICAL 5
+#define KOMPUTE_LOG_LEVEL_OFF 6
+
+// Logging is disabled entirely.
+#if KOMPUTE_OPT_LOG_LEVEL_DISABLED
+#define KP_LOG_TRACE(...)
+#define KP_LOG_DEBUG(...)
+#define KP_LOG_INFO(...)
+#define KP_LOG_WARN(...)
+#define KP_LOG_ERROR(...)
+#else
+
+#if !KOMPUTE_OPT_USE_SPDLOG
+#if VK_USE_PLATFORM_ANDROID_KHR
+#include <android/log.h>
+#include <fmt/core.h>
+static const char* KOMPUTE_LOG_TAG = "KomputeLog";
+#else
+#if KOMPUTE_BUILD_PYTHON
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+// from python/src/main.cpp
+extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
+#else
+#include <fmt/core.h>
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else
+#include <spdlog/spdlog.h>
+#endif // !KOMPUTE_OPT_USE_SPDLOG
+#include <set>
+#include <string>
+#include <vector>
+namespace logger {
+// Setup the logger, note the loglevel can not be set below the CMake log level
+// (To change this use -DKOMPUTE_OPT_LOG_LEVEL=...)
+void
+setupLogger();
+
+// Logging is enabled, but we do not use Spdlog. So we use fmt in case nothing
+// else is defined, overriding logging.
+#if !KOMPUTE_OPT_USE_SPDLOG
+
+#ifndef KP_LOG_TRACE // only define when no KP_LOG_TRACE has been provided already
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_TRACE
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_TRACE(...)                                                      \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_VERBOSE, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON // forward to the kp_trace python object; must define KP_LOG_TRACE, not KP_LOG_DEBUG
+#define KP_LOG_TRACE(...) kp_trace(fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_TRACE(...)                                                      \
+    fmt::print("[{} {}] [trace] [{}:{}] {}\n",                                 \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else // trace is below the active log level: the macro expands to nothing
+#define KP_LOG_TRACE(...)
+#endif // KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_TRACE
+#endif // !KP_LOG_TRACE
+
+#ifndef KP_LOG_DEBUG // only define when no KP_LOG_DEBUG has been provided already
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_DEBUG(...)                                                      \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON // forward to the kp_debug python object
+#define KP_LOG_DEBUG(...) kp_debug(fmt::format(__VA_ARGS__))
+#else
+#ifdef __FILE_NAME__ // gcc 12 provides only file name without path
+#define KP_LOG_DEBUG(...)                                                      \
+    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE_NAME__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#else // no __FILE_NAME__ available: fall back to __FILE__
+#define KP_LOG_DEBUG(...)                                                      \
+    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // __FILE_NAME__
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else // debug is below the active log level: the macro expands to nothing
+#define KP_LOG_DEBUG(...)
+#endif // KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
+#endif // !KP_LOG_DEBUG
+
+#ifndef KP_LOG_INFO // only define when no KP_LOG_INFO has been provided already
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_INFO(...)                                                       \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON // forward to the kp_info python object; must define KP_LOG_INFO, not KP_LOG_DEBUG
+#define KP_LOG_INFO(...) kp_info(fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_INFO(...)                                                       \
+    fmt::print("[{} {}] [info] [{}:{}] {}\n",                                  \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else // info is below the active log level: the macro expands to nothing
+#define KP_LOG_INFO(...)
+#endif // KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
+#endif // !KP_LOG_INFO
+
+#ifndef KP_LOG_WARN // only define when no KP_LOG_WARN has been provided already
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_WARN
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_WARN(...)                                                       \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_WARN, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON // forward to the kp_warning python object; must define KP_LOG_WARN, not KP_LOG_DEBUG
+#define KP_LOG_WARN(...) kp_warning(fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_WARN(...)                                                       \
+    fmt::print("[{} {}] [warn] [{}:{}] {}\n",                                  \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else // warn is below the active log level: the macro expands to nothing
+#define KP_LOG_WARN(...)
+#endif // KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_WARN
+#endif // !KP_LOG_WARN
+
+#ifndef KP_LOG_ERROR
+#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_ERROR
+#if VK_USE_PLATFORM_ANDROID_KHR
+#define KP_LOG_ERROR(...)                                                      \
+    ((void)__android_log_write(                                                \
+      ANDROID_LOG_ERROR, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
+#else
+#if KOMPUTE_BUILD_PYTHON
+#define KP_LOG_DEBUG(...) kp_error(fmt::format(__VA_ARGS__))
+#else
+#define KP_LOG_ERROR(...)                                                      \
+    fmt::print("[{} {}] [error] [{}:{}] {}\n",                                 \
+               __DATE__,                                                       \
+               __TIME__,                                                       \
+               __FILE__,                                                       \
+               __LINE__,                                                       \
+               fmt::format(__VA_ARGS__))
+#endif // KOMPUTE_BUILD_PYTHON
+#endif // VK_USE_PLATFORM_ANDROID_KHR
+#else
+#define KP_LOG_ERROR(...)
+#endif
+#endif // !KP_LOG_ERROR
+#else
+
+#define KP_LOG_TRACE(...) SPDLOG_TRACE(__VA_ARGS__)
+#define KP_LOG_DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__)
+#define KP_LOG_INFO(...) SPDLOG_INFO(__VA_ARGS__)
+#define KP_LOG_WARN(...) SPDLOG_WARN(__VA_ARGS__)
+#define KP_LOG_ERROR(...) SPDLOG_ERROR(__VA_ARGS__)
+
+void
+setLogLevel(spdlog::level::level_enum level);
+
+spdlog::level::level_enum
+getLogLevel();
+
+#endif // !KOMPUTE_OPT_USE_SPDLOG
+} // namespace logger
+
+#endif // KOMPUTE_OPT_LOG_LEVEL_DISABLED
diff --git a/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp b/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
new file mode 100644
index 000000000..e91598f05
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Algorithm.hpp"
+#include "kompute/Core.hpp"
+#include "kompute/Tensor.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that provides a general abstraction that simplifies the use of
+ * algorithm and parameter components which can be used with shaders.
+ * By default it enables the user to provide a dynamic number of tensors
+ * which are then passed as inputs.
+ */
+class OpAlgoDispatch : public OpBase
+{
+  public:
+    /**
+     * Stores the algorithm to dispatch and, when pushConstants is non-empty,
+     * a malloc'd byte-for-byte copy of those push constants.
+     *
+     * @param algorithm The algorithm object to use for dispatch
+     * @param pushConstants The push constants to use for override
+     */
+    template<typename T = float>
+    OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm,
+                   const std::vector<T>& pushConstants = {})
+    {
+        KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor");
+
+        this->mAlgorithm = algorithm;
+
+        if (!pushConstants.empty()) {
+            const uint32_t elementSize = sizeof(T);
+            const uint32_t elementCount = pushConstants.size();
+            const uint32_t byteCount = elementCount * elementSize;
+            this->mPushConstantsData = malloc(byteCount);
+            memcpy(this->mPushConstantsData, pushConstants.data(), byteCount);
+            this->mPushConstantsDataTypeMemorySize = elementSize;
+            this->mPushConstantsSize = elementCount;
+        }
+    }
+
+    /**
+     * Destructor, documented as being in charge of destroying the algorithm
+     * components while leaving the underlying tensors alone.
+     */
+    virtual ~OpAlgoDispatch() override;
+
+    /**
+     * Records the commands that are to be sent to the GPU. As documented, this
+     * includes the barriers that ensure the memory has been copied before
+     * going in and out of the shader, the dispatch operation that sends the
+     * shader processing to the gpu, and the GPU memory copy of the output
+     * data for the staging buffer so it can be read by the host.
+     * (Implementation is outside this header.)
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::shared_ptr<Algorithm> mAlgorithm;
+    void* mPushConstantsData = nullptr; // malloc'd copy made by the constructor
+    uint32_t mPushConstantsDataTypeMemorySize = 0; // sizeof one push constant
+    uint32_t mPushConstantsSize = 0; // number of push constant elements
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBase.hpp b/kompute/src/include/kompute/operations/OpBase.hpp
new file mode 100644
index 000000000..737670846
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpBase.hpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Algorithm.hpp"
+#include "kompute/Core.hpp"
+#include "kompute/Tensor.hpp"
+
+namespace kp {
+
+/**
+ *  Base Operation which provides the high level interface that Kompute
+ *  operations implement in order to perform a set of actions in the GPU.
+ *
+ *  Operations can perform actions on tensors, and optionally can also own an
+ *  Algorithm with respective parameters. kp::Operations with kp::Algorithms
+ *  would inherit from kp::OpBaseAlgo.
+ */
+class OpBase
+{
+  public:
+    /**
+     * Virtual destructor so derived operations can be destroyed through an
+     * OpBase pointer. The base implementation only emits a debug log line;
+     * OpBase declares no data members, so it has nothing to free itself.
+     */
+    virtual ~OpBase() { KP_LOG_DEBUG("Kompute OpBase destructor started"); }
+
+    /**
+     * Records this operation's commands into the given command buffer so
+     * that they can later be submitted to the GPU as part of a batch.
+     * Pure virtual: every concrete operation has to provide it.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void record(const vk::CommandBuffer& commandBuffer) = 0;
+
+    /**
+     * Pre eval is called before the Sequence has called eval and submitted the
+     * commands to the GPU for processing, and can be used to perform any
+     * per-eval setup steps required as the computation iteration begins. It's
+     * worth noting that there are situations where eval can be called multiple
+     * times, so the resources that are created should be idempotent in case
+     * it's called multiple times in a row.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0;
+
+    /**
+     * Post eval is called after the Sequence has called eval and submitted the
+     * commands to the GPU for processing, and can be used to perform any
+     * tear-down steps required as the computation iteration finishes. It's
+     * worth noting that there are situations where eval can be called multiple
+     * times, so the resources that are destroyed should not require a re-init
+     * unless explicitly provided by the user.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp b/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
new file mode 100644
index 000000000..50d8e9707
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+class OpBufferSyncDevice : public OpBase
+{
+  public:
+    OpBufferSyncDevice(
+        vk::Buffer *primaryBuffer, // NOTE(review): presumably the device-side buffer - confirm
+        vk::Buffer *stagingBuffer, // NOTE(review): presumably the host-visible staging buffer - confirm
+        vk::DeviceSize size);      // NOTE(review): presumably the byte count to copy - confirm
+
+    /**
+     * Destructor. Documented as not managing memory: the buffers are held as
+     * raw pointers (NOTE(review): presumably non-owning - confirm).
+     */
+    ~OpBufferSyncDevice() override;
+
+    /**
+     * For device buffers, it records the copy command for the buffer to copy
+     * the data from its staging to device memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    vk::Buffer *mPrimaryBuffer; // raw pointer, no default initializer
+    vk::Buffer *mStagingBuffer; // raw pointer, no default initializer
+    vk::DeviceSize mSize;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp b/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
new file mode 100644
index 000000000..7db997199
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+class OpBufferSyncLocal : public OpBase
+{
+  public:
+    OpBufferSyncLocal(
+        vk::Buffer *primaryBuffer, // NOTE(review): presumably the device-side buffer - confirm
+        vk::Buffer *stagingBuffer, // NOTE(review): presumably the host-visible staging buffer - confirm
+        vk::DeviceSize size);      // NOTE(review): presumably the byte count to copy - confirm
+
+    /**
+     * Destructor. Documented as not managing memory: the buffers are held as
+     * raw pointers (NOTE(review): presumably non-owning - confirm).
+     */
+    ~OpBufferSyncLocal() override;
+
+    /**
+     * Records the buffer copy command. NOTE(review): presumably device ->
+     * staging here (this text was copied from OpBufferSyncDevice) - confirm.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    vk::Buffer *mPrimaryBuffer; // raw pointer, no default initializer
+    vk::Buffer *mStagingBuffer; // raw pointer, no default initializer
+    vk::DeviceSize mSize;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp b/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
new file mode 100644
index 000000000..4a2322323
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Algorithm.hpp"
+#include "kompute/Core.hpp"
+#include "kompute/Tensor.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that exposes the pipeline barrier functionality specifically for
+ * memory barriers on a set of tensors. The barrier is configured through the
+ * respective source and destination access masks and pipeline stage masks,
+ * and targets either the primary or the staging buffer of each tensor
+ * (see barrierOnPrimary).
+ */
+class OpMemoryBarrier : public OpBase
+{
+  public:
+    /**
+     * Constructor that stores tensors as well as memory barrier parameters to
+     * be used to create a pipeline barrier on the respective primary or staging
+     * tensor.
+     *
+     * @param tensors The tensors to apply the memory barriers on
+     * @param srcAccessMask The vk::AccessFlagBits for the source access mask
+     * @param dstAccessMask The vk::AccessFlagBits for the destination access
+     * mask
+     * @param srcStageMask The vk::PipelineStageFlagBits for the source stage
+     * mask
+     * @param dstStageMask The vk::PipelineStageFlagBits for the destination
+     * stage mask
+     * @param barrierOnPrimary Boolean to select primary or staging buffers on
+     * tensors (defaults to true, i.e. primary)
+     */
+    OpMemoryBarrier(const std::vector<std::shared_ptr<Tensor>>& tensors,
+                    const vk::AccessFlagBits& srcAccessMask,
+                    const vk::AccessFlagBits& dstAccessMask,
+                    const vk::PipelineStageFlagBits& srcStageMask,
+                    const vk::PipelineStageFlagBits& dstStageMask,
+                    bool barrierOnPrimary = true);
+
+    /**
+     * Default destructor, which is in charge of destroying the reference to the
+     * tensors and all the relevant access / stage masks created
+     */
+    virtual ~OpMemoryBarrier() override;
+
+    /**
+     * This records the memory barrier with the access and stage masks provided
+     * across all relevant tensors.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    const vk::AccessFlagBits mSrcAccessMask;
+    const vk::AccessFlagBits mDstAccessMask;
+    const vk::PipelineStageFlagBits mSrcStageMask;
+    const vk::PipelineStageFlagBits mDstStageMask;
+    const bool mBarrierOnPrimary;
+    const std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpMult.hpp b/kompute/src/include/kompute/operations/OpMult.hpp
new file mode 100644
index 000000000..f75ccc4fb
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpMult.hpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include <fstream>
+
+#include "kompute/Core.hpp"
+
+#include "ShaderOpMult.hpp"
+
+#include "kompute/Algorithm.hpp"
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpAlgoDispatch.hpp"
+
+namespace kp {
+
+/**
+ * Operation that multiplies two tensors and writes the output to a third
+ * tensor, using the bundled ShaderOpMult SPIR-V.
+ */
+class OpMult : public OpAlgoDispatch
+{
+  public:
+    /**
+     * Validates that exactly three tensors were supplied and rebuilds the
+     * given algorithm with those tensors and the OpMult shader words.
+     * Throws std::runtime_error when the tensor count is not 3.
+     *
+     * @param tensors Tensors that are to be used in this operation
+     * @param algorithm An algorithm that will be overridden with the OpMult
+     * shader data and the tensors provided which are expected to be 3
+     */
+    OpMult(std::vector<std::shared_ptr<Tensor>> tensors,
+           std::shared_ptr<Algorithm> algorithm)
+      : OpAlgoDispatch(algorithm)
+    {
+        KP_LOG_DEBUG("Kompute OpMult constructor with params");
+
+        if (tensors.size() != 3) {
+            const std::string got = std::to_string(tensors.size());
+            throw std::runtime_error(
+              "Kompute OpMult expected 3 tensors but got " + got);
+        }
+
+        const std::vector<uint32_t> spirv(SHADEROPMULT_COMP_SPV.begin(),
+                                          SHADEROPMULT_COMP_SPV.end());
+
+        algorithm->rebuild<>(tensors, spirv);
+    }
+
+    /**
+     * Destructor; only emits a debug log line here. The base class
+     * destructor handles the rest.
+     */
+    ~OpMult() override { KP_LOG_DEBUG("Kompute OpMult destructor started"); }
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorCopy.hpp b/kompute/src/include/kompute/operations/OpTensorCopy.hpp
new file mode 100644
index 000000000..968c1065a
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpTensorCopy.hpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that copies the data from the first tensor to the rest of the
+ * tensors provided, using a record command for all the vectors. This operation
+ * does not own/manage the memory of the tensors passed to it.
+ * NOTE(review): the required tensor type was left unstated here - confirm.
+ */
+class OpTensorCopy : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation.
+     *
+     * @param tensors Tensors that will be used in the operation.
+     */
+    OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorCopy() override;
+
+    /**
+     * Records the copy commands from the first tensor into all the other
+     * tensors provided. Also optionally records a barrier.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Copies the local vectors for all the tensors to sync the data with the
+     * gpu.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp b/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
new file mode 100644
index 000000000..9b39e490f
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+#include "kompute/Tensor.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that syncs tensor's device by mapping local data into the device
+ * memory. For TensorTypes::eDevice it will use a record operation for the
+ * memory to be synced into GPU memory which means that the operation will be
+ * done in sync with GPU commands. For TensorTypes::eHost it will only map the
+ * data into host memory which will happen during preEval before the recorded
+ * commands are dispatched.
+ */
+class OpTensorSyncDevice : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation. The tensors
+     * provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param tensors Tensors that will be used in the operation.
+     */
+    OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorSyncDevice() override;
+
+    /**
+     * For device tensors, it records the copy command for the tensor to copy
+     * the data from its staging to device memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands. NOTE(review): the class comment
+     * mentions a host-memory map during preEval; confirm which is accurate.
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+    vk::Buffer *mPrimaryBuffer; // raw pointer, no default initializer
+    vk::Buffer *mStagingBuffer; // raw pointer, no default initializer
+    vk::DeviceSize mSize;
+};
+
+} // End namespace kp
diff --git a/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp b/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
new file mode 100644
index 000000000..4216003e5
--- /dev/null
+++ b/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that syncs tensor's local memory by mapping device data into the
+ * local CPU memory. For TensorTypes::eDevice it will use a record operation
+ * to copy the data from device to staging memory, which means that the
+ * operation will be done in sync with GPU commands. For TensorTypes::eHost it
+ * will only map the data into local memory, which happens during postEval
+ * after the recorded commands have been submitted (per the method comments).
+ */
+class OpTensorSyncLocal : public OpBase
+{
+  public:
+    /**
+     * Default constructor with parameters that provides the core vulkan
+     * resources and the tensors that will be used in the operation. The tensors
+     * provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param tensors Tensors that will be used in the operation.
+     */
+    OpTensorSyncLocal(const std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be
+     * expecting the parent to perform a release.
+     */
+    ~OpTensorSyncLocal() override;
+
+    /**
+     * For device tensors, it records the copy command for the tensor to copy
+     * the data from its device to staging memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * For host tensors it performs the map command from the host memory into
+     * local memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+  private:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
diff --git a/kompute/src/logger/CMakeLists.txt b/kompute/src/logger/CMakeLists.txt
new file mode 100644
index 000000000..1dcc1e6b5
--- /dev/null
+++ b/kompute/src/logger/CMakeLists.txt
@@ -0,0 +1,69 @@
+cmake_minimum_required(VERSION 3.20)
+
+set(LOGGER_SOURCES Logger.cpp)
+
+add_library(kp_logger ${LOGGER_SOURCES})
+
+# Define log levels in code
+add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_DEBUG=1)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_INFO=2)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_WARN=3)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_ERROR=4)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_CRITICAL=5)
+add_compile_definitions(KOMPUTE_LOG_LEVEL_OFF=6)
+
+if(KOMPUTE_OPT_BUILD_PYTHON AND KOMPUTE_OPT_USE_SPDLOG)
+    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_PYTHON' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
+endif()
+
+if(KOMPUTE_OPT_ANDROID_BUILD AND KOMPUTE_OPT_USE_SPDLOG)
+    message(FATAL_ERROR "'KOMPUTE_OPT_ANDROID_BUILD' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
+endif()
+
+if("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Trace") # quoted: an unset/empty value must reach else() below
+    set(KOMPUTE_OPT_LOG_LEVEL TRACE)
+    message(STATUS "Using log level Trace")
+elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Debug")
+    set(KOMPUTE_OPT_LOG_LEVEL DEBUG)
+    message(STATUS "Using log level Debug")
+elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Info")
+    set(KOMPUTE_OPT_LOG_LEVEL INFO)
+    message(STATUS "Using log level Info")
+elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Warn")
+    set(KOMPUTE_OPT_LOG_LEVEL WARN)
+    message(STATUS "Using log level Warn")
+elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Error")
+    set(KOMPUTE_OPT_LOG_LEVEL ERROR)
+    message(STATUS "Using log level Error")
+elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Critical")
+    set(KOMPUTE_OPT_LOG_LEVEL CRITICAL)
+    message(STATUS "Using log level Critical")
+elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Off")
+    set(KOMPUTE_OPT_LOG_LEVEL OFF)
+    message(STATUS "Using log level Off")
+elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Default")
+    set(KOMPUTE_OPT_LOG_LEVEL $<IF:$<CONFIG:Debug>,DEBUG,INFO>) # resolved per build configuration at generate time
+    message(STATUS "Setting KOMPUTE_OPT_LOG_LEVEL to according to the build type")
+else() # unknown, empty or unset value
+    message(FATAL_ERROR "Log level '${KOMPUTE_OPT_LOG_LEVEL}' unknown, use -DKOMPUTE_OPT_LOG_LEVEL={Trace, Debug, Info, Warn, Error, Critical, Off, Default} to set it to a correct value.")
+endif()
+
+# Always make sure we define the Kompute log level independent of the Spdlog log level
+target_compile_definitions(kp_logger INTERFACE KOMPUTE_OPT_ACTIVE_LOG_LEVEL=KOMPUTE_LOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
+
+# Link depending on how the logger should be setup
+if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
+    if(KOMPUTE_OPT_USE_SPDLOG)
+        target_link_libraries(kp_logger PUBLIC spdlog::spdlog)
+        target_compile_definitions(spdlog INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
+        target_compile_definitions(kp_logger INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
+        message(STATUS "setting SPDLOG_ACTIVE_LEVEL to SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}")
+
+        if(KOMPUTE_OPT_SPDLOG_ASYNC_MODE)
+            target_compile_definitions(kp_logger INTERFACE KOMPUTE_SPDLOG_ASYNC_LOGGING=1)
+        endif()
+    else()
+        target_link_libraries(kp_logger PUBLIC fmt::fmt)
+    endif()
+endif()
diff --git a/kompute/src/logger/Logger.cpp b/kompute/src/logger/Logger.cpp
new file mode 100644
index 000000000..69df2b609
--- /dev/null
+++ b/kompute/src/logger/Logger.cpp
@@ -0,0 +1,101 @@
+#include "kompute/logger/Logger.hpp"
+
+#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
+#if !KOMPUTE_OPT_USE_SPDLOG
+#else
+#include <cassert>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <spdlog/async.h>
+#include <spdlog/common.h>
+#include <spdlog/logger.h>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+#include <string>
+#endif // !KOMPUTE_OPT_USE_SPDLOG
+
+namespace logger {
+#if !KOMPUTE_OPT_USE_SPDLOG
+
+void
+setupLogger()
+{
+}
+
+#else
+constexpr int THREAD_QUEUE_LENGTH = 8192;
+
+void
+setupLogger()
+{
+    // One-shot guard: only the first caller proceeds past this scope
+    static bool setup = false;
+    static std::mutex setupMutex{};
+    {
+        const std::lock_guard<std::mutex> guard(setupMutex);
+        if (setup) {
+            return;
+        }
+        setup = true;
+    }
+
+    spdlog::init_thread_pool(THREAD_QUEUE_LENGTH, 1);
+    spdlog::sink_ptr consoleSink =
+      std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
+#if SPDLOG_ACTIVE_LEVEL < SPDLOG_LEVEL_INFO
+    consoleSink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=21s] %v");
+#else
+    consoleSink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=15s] %v");
+#endif
+    std::vector<spdlog::sink_ptr> sinkList{ consoleSink };
+    // TODO: Add flag in compile flags
+    std::shared_ptr<spdlog::logger> rootLogger =
+#if KOMPUTE_SPDLOG_ASYNC_LOGGING
+          std::make_shared<spdlog::async_logger>(
+            "",
+            sinkList.begin(),
+            sinkList.end(),
+            spdlog::thread_pool(),
+            spdlog::async_overflow_policy::block);
+#else
+          std::make_shared<spdlog::logger>(
+            "",
+            sinkList.begin(),
+            sinkList.end());
+#endif
+
+    rootLogger->set_level(getLogLevel());
+
+    spdlog::set_default_logger(rootLogger);
+}
+
+spdlog::level::level_enum
+getLogLevel()
+{
+#if SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_TRACE
+    return spdlog::level::trace;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_DEBUG
+    return spdlog::level::debug;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_INFO
+    return spdlog::level::info;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_WARN
+    return spdlog::level::warn;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_ERROR
+    return spdlog::level::error;
+#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_CRITICAL
+    return spdlog::level::critical;
+#else
+    return spdlog::level::off;
+#endif
+}
+
+void
+setLogLevel(const spdlog::level::level_enum level)
+{
+    spdlog::default_logger()->set_level(level);
+}
+#endif // !KOMPUTE_OPT_USE_SPDLOG
+} // namespace logger
+
+#endif
diff --git a/kompute/src/shaders/CMakeLists.txt b/kompute/src/shaders/CMakeLists.txt
new file mode 100644
index 000000000..901bf3e8a
--- /dev/null
+++ b/kompute/src/shaders/CMakeLists.txt
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+# ######################
+cmake_minimum_required(VERSION 3.20)
+
+add_subdirectory(glsl)
\ No newline at end of file
diff --git a/kompute/src/shaders/glsl/CMakeLists.txt b/kompute/src/shaders/glsl/CMakeLists.txt
new file mode 100644
index 000000000..3101a2b17
--- /dev/null
+++ b/kompute/src/shaders/glsl/CMakeLists.txt
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+# ######################
+cmake_minimum_required(VERSION 3.20)
+
+# Check if build shaders from source is enabled
+if(KOMPUTE_OPT_BUILD_SHADERS)
+    vulkan_compile_shader(INFILE ShaderOpMult.comp
+        OUTFILE ShaderOpMult.hpp
+        NAMESPACE "kp")
+
+    vulkan_compile_shader(INFILE ShaderLogisticRegression.comp
+        OUTFILE ShaderLogisticRegression.hpp
+        NAMESPACE "kp")
+else() # Else we will use our precompiled versions
+    add_custom_command(OUTPUT $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderOpMult.hpp.in $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp)
+    add_custom_command(OUTPUT $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderLogisticRegression.hpp.in $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp)
+endif()
+
+add_library(kp_shader INTERFACE "${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp"
+    "${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp")
+
+target_include_directories(kp_shader INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
+
+# Make sure we install shaders:
+install(FILES $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(FILES $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.comp b/kompute/src/shaders/glsl/ShaderLogisticRegression.comp
new file mode 100644
index 000000000..5a1c5d948
--- /dev/null
+++ b/kompute/src/shaders/glsl/ShaderLogisticRegression.comp
@@ -0,0 +1,52 @@
+#version 450
+
+layout (constant_id = 0) const float m = 0;
+
+layout (local_size_x = 1) in;
+
+layout(set = 0, binding = 0) buffer bxi { float xi[]; };
+layout(set = 0, binding = 1) buffer bxj { float xj[]; };
+layout(set = 0, binding = 2) buffer by { float y[]; };
+layout(set = 0, binding = 3) buffer bwin { float win[]; };
+layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
+layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
+layout(set = 0, binding = 6) buffer bbin { float bin[]; };
+layout(set = 0, binding = 7) buffer bbout { float bout[]; };
+layout(set = 0, binding = 8) buffer blout { float lout[]; };
+
+float sigmoid(float z) {
+    return 1.0 / (1.0 + exp(-z));
+}
+
+float inference(vec2 x, vec2 w, float b) {
+    // Compute the linear mapping function
+    float z = dot(w, x) + b;
+    // Calculate the y-hat with sigmoid
+    float yHat = sigmoid(z);
+    return yHat;
+}
+
+float calculateLoss(float yHat, float y) {
+    return -(y * log(yHat)  +  (1.0 - y) * log(1.0 - yHat));
+}
+
+void main() {
+    uint idx = gl_GlobalInvocationID.x;
+
+    vec2 wCurr = vec2(win[0], win[1]);
+    float bCurr = bin[0];
+
+    vec2 xCurr = vec2(xi[idx], xj[idx]);
+    float yCurr = y[idx];
+
+    float yHat = inference(xCurr, wCurr, bCurr);
+
+    float dZ = yHat - yCurr;
+    vec2 dW = (1. / m) * xCurr * dZ;
+    float dB = (1. / m) * dZ;
+    wouti[idx] = dW.x;
+    woutj[idx] = dW.y;
+    bout[idx] = dB;
+
+    lout[idx] = calculateLoss(yHat, yCurr);
+}
diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in b/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
new file mode 100644
index 000000000..bfe7792c6
--- /dev/null
+++ b/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in
@@ -0,0 +1,310 @@
+#pragma once
+#include <array>
+#include <cstdint>
+
+namespace kp {
+const std::array<uint32_t, 1204> SHADERLOGISTICREGRESSION_COMP_SPV = { 
+0x07230203, 0x00010000, 0x0008000a, 0x000000ae, 
+0x00000000, 0x00020011, 0x00000001, 0x0006000b, 
+0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 
+0x00000000, 0x0003000e, 0x00000000, 0x00000001, 
+0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, 
+0x00000000, 0x00000041, 0x00060010, 0x00000004, 
+0x00000011, 0x00000001, 0x00000001, 0x00000001, 
+0x00030003, 0x00000002, 0x000001c2, 0x00040005, 
+0x00000004, 0x6e69616d, 0x00000000, 0x00050005, 
+0x0000000a, 0x6d676973, 0x2864696f, 0x003b3166, 
+0x00030005, 0x00000009, 0x0000007a, 0x00080005, 
+0x00000012, 0x65666e69, 0x636e6572, 0x66762865, 
+0x66763b32, 0x31663b32, 0x0000003b, 0x00030005, 
+0x0000000f, 0x00000078, 0x00030005, 0x00000010, 
+0x00000077, 0x00030005, 0x00000011, 0x00000062, 
+0x00080005, 0x00000017, 0x636c6163, 0x74616c75, 
+0x736f4c65, 0x31662873, 0x3b31663b, 0x00000000, 
+0x00040005, 0x00000015, 0x74614879, 0x00000000, 
+0x00030005, 0x00000016, 0x00000079, 0x00030005, 
+0x00000021, 0x0000007a, 0x00040005, 0x00000027, 
+0x74614879, 0x00000000, 0x00040005, 0x00000028, 
+0x61726170, 0x0000006d, 0x00030005, 0x0000003e, 
+0x00786469, 0x00080005, 0x00000041, 0x475f6c67, 
+0x61626f6c, 0x766e496c, 0x7461636f, 0x496e6f69, 
+0x00000044, 0x00040005, 0x00000046, 0x72754377, 
+0x00000072, 0x00040005, 0x00000048, 0x6e697762, 
+0x00000000, 0x00040006, 0x00000048, 0x00000000, 
+0x006e6977, 0x00030005, 0x0000004a, 0x00000000, 
+0x00040005, 0x00000054, 0x72754362, 0x00000072, 
+0x00040005, 0x00000056, 0x6e696262, 0x00000000, 
+0x00040006, 0x00000056, 0x00000000, 0x006e6962, 
+0x00030005, 0x00000058, 0x00000000, 0x00040005, 
+0x0000005b, 0x72754378, 0x00000072, 0x00030005, 
+0x0000005d, 0x00697862, 0x00040006, 0x0000005d, 
+0x00000000, 0x00006978, 0x00030005, 0x0000005f, 
+0x00000000, 0x00030005, 0x00000064, 0x006a7862, 
+0x00040006, 0x00000064, 0x00000000, 0x00006a78, 
+0x00030005, 0x00000066, 0x00000000, 0x00040005, 
+0x0000006b, 0x72754379, 0x00000072, 0x00030005, 
+0x0000006d, 0x00007962, 0x00040006, 0x0000006d, 
+0x00000000, 0x00000079, 0x00030005, 0x0000006f, 
+0x00000000, 0x00040005, 0x00000073, 0x74614879, 
+0x00000000, 0x00040005, 0x00000074, 0x61726170, 
+0x0000006d, 0x00040005, 0x00000076, 0x61726170, 
+0x0000006d, 0x00040005, 0x00000078, 0x61726170, 
+0x0000006d, 0x00030005, 0x0000007b, 0x00005a64, 
+0x00030005, 0x0000007f, 0x00005764, 0x00030005, 
+0x00000080, 0x0000006d, 0x00030005, 0x00000086, 
+0x00004264, 0x00040005, 0x0000008b, 0x756f7762, 
+0x00006974, 0x00050006, 0x0000008b, 0x00000000, 
+0x74756f77, 0x00000069, 0x00030005, 0x0000008d, 
+0x00000000, 0x00040005, 0x00000093, 0x756f7762, 
+0x00006a74, 0x00050006, 0x00000093, 0x00000000, 
+0x74756f77, 0x0000006a, 0x00030005, 0x00000095, 
+0x00000000, 0x00040005, 0x0000009c, 0x756f6262, 
+0x00000074, 0x00050006, 0x0000009c, 0x00000000, 
+0x74756f62, 0x00000000, 0x00030005, 0x0000009e, 
+0x00000000, 0x00040005, 0x000000a3, 0x756f6c62, 
+0x00000074, 0x00050006, 0x000000a3, 0x00000000, 
+0x74756f6c, 0x00000000, 0x00030005, 0x000000a5, 
+0x00000000, 0x00040005, 0x000000a7, 0x61726170, 
+0x0000006d, 0x00040005, 0x000000a9, 0x61726170, 
+0x0000006d, 0x00040047, 0x00000041, 0x0000000b, 
+0x0000001c, 0x00040047, 0x00000047, 0x00000006, 
+0x00000004, 0x00050048, 0x00000048, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x00000048, 
+0x00000003, 0x00040047, 0x0000004a, 0x00000022, 
+0x00000000, 0x00040047, 0x0000004a, 0x00000021, 
+0x00000003, 0x00040047, 0x00000055, 0x00000006, 
+0x00000004, 0x00050048, 0x00000056, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x00000056, 
+0x00000003, 0x00040047, 0x00000058, 0x00000022, 
+0x00000000, 0x00040047, 0x00000058, 0x00000021, 
+0x00000006, 0x00040047, 0x0000005c, 0x00000006, 
+0x00000004, 0x00050048, 0x0000005d, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x0000005d, 
+0x00000003, 0x00040047, 0x0000005f, 0x00000022, 
+0x00000000, 0x00040047, 0x0000005f, 0x00000021, 
+0x00000000, 0x00040047, 0x00000063, 0x00000006, 
+0x00000004, 0x00050048, 0x00000064, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x00000064, 
+0x00000003, 0x00040047, 0x00000066, 0x00000022, 
+0x00000000, 0x00040047, 0x00000066, 0x00000021, 
+0x00000001, 0x00040047, 0x0000006c, 0x00000006, 
+0x00000004, 0x00050048, 0x0000006d, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x0000006d, 
+0x00000003, 0x00040047, 0x0000006f, 0x00000022, 
+0x00000000, 0x00040047, 0x0000006f, 0x00000021, 
+0x00000002, 0x00040047, 0x00000080, 0x00000001, 
+0x00000000, 0x00040047, 0x0000008a, 0x00000006, 
+0x00000004, 0x00050048, 0x0000008b, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x0000008b, 
+0x00000003, 0x00040047, 0x0000008d, 0x00000022, 
+0x00000000, 0x00040047, 0x0000008d, 0x00000021, 
+0x00000004, 0x00040047, 0x00000092, 0x00000006, 
+0x00000004, 0x00050048, 0x00000093, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x00000093, 
+0x00000003, 0x00040047, 0x00000095, 0x00000022, 
+0x00000000, 0x00040047, 0x00000095, 0x00000021, 
+0x00000005, 0x00040047, 0x0000009b, 0x00000006, 
+0x00000004, 0x00050048, 0x0000009c, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x0000009c, 
+0x00000003, 0x00040047, 0x0000009e, 0x00000022, 
+0x00000000, 0x00040047, 0x0000009e, 0x00000021, 
+0x00000007, 0x00040047, 0x000000a2, 0x00000006, 
+0x00000004, 0x00050048, 0x000000a3, 0x00000000, 
+0x00000023, 0x00000000, 0x00030047, 0x000000a3, 
+0x00000003, 0x00040047, 0x000000a5, 0x00000022, 
+0x00000000, 0x00040047, 0x000000a5, 0x00000021, 
+0x00000008, 0x00040047, 0x000000ad, 0x0000000b, 
+0x00000019, 0x00020013, 0x00000002, 0x00030021, 
+0x00000003, 0x00000002, 0x00030016, 0x00000006, 
+0x00000020, 0x00040020, 0x00000007, 0x00000007, 
+0x00000006, 0x00040021, 0x00000008, 0x00000006, 
+0x00000007, 0x00040017, 0x0000000c, 0x00000006, 
+0x00000002, 0x00040020, 0x0000000d, 0x00000007, 
+0x0000000c, 0x00060021, 0x0000000e, 0x00000006, 
+0x0000000d, 0x0000000d, 0x00000007, 0x00050021, 
+0x00000014, 0x00000006, 0x00000007, 0x00000007, 
+0x0004002b, 0x00000006, 0x00000019, 0x3f800000, 
+0x00040015, 0x0000003c, 0x00000020, 0x00000000, 
+0x00040020, 0x0000003d, 0x00000007, 0x0000003c, 
+0x00040017, 0x0000003f, 0x0000003c, 0x00000003, 
+0x00040020, 0x00000040, 0x00000001, 0x0000003f, 
+0x0004003b, 0x00000040, 0x00000041, 0x00000001, 
+0x0004002b, 0x0000003c, 0x00000042, 0x00000000, 
+0x00040020, 0x00000043, 0x00000001, 0x0000003c, 
+0x0003001d, 0x00000047, 0x00000006, 0x0003001e, 
+0x00000048, 0x00000047, 0x00040020, 0x00000049, 
+0x00000002, 0x00000048, 0x0004003b, 0x00000049, 
+0x0000004a, 0x00000002, 0x00040015, 0x0000004b, 
+0x00000020, 0x00000001, 0x0004002b, 0x0000004b, 
+0x0000004c, 0x00000000, 0x00040020, 0x0000004d, 
+0x00000002, 0x00000006, 0x0004002b, 0x0000004b, 
+0x00000050, 0x00000001, 0x0003001d, 0x00000055, 
+0x00000006, 0x0003001e, 0x00000056, 0x00000055, 
+0x00040020, 0x00000057, 0x00000002, 0x00000056, 
+0x0004003b, 0x00000057, 0x00000058, 0x00000002, 
+0x0003001d, 0x0000005c, 0x00000006, 0x0003001e, 
+0x0000005d, 0x0000005c, 0x00040020, 0x0000005e, 
+0x00000002, 0x0000005d, 0x0004003b, 0x0000005e, 
+0x0000005f, 0x00000002, 0x0003001d, 0x00000063, 
+0x00000006, 0x0003001e, 0x00000064, 0x00000063, 
+0x00040020, 0x00000065, 0x00000002, 0x00000064, 
+0x0004003b, 0x00000065, 0x00000066, 0x00000002, 
+0x0003001d, 0x0000006c, 0x00000006, 0x0003001e, 
+0x0000006d, 0x0000006c, 0x00040020, 0x0000006e, 
+0x00000002, 0x0000006d, 0x0004003b, 0x0000006e, 
+0x0000006f, 0x00000002, 0x00040032, 0x00000006, 
+0x00000080, 0x00000000, 0x0003001d, 0x0000008a, 
+0x00000006, 0x0003001e, 0x0000008b, 0x0000008a, 
+0x00040020, 0x0000008c, 0x00000002, 0x0000008b, 
+0x0004003b, 0x0000008c, 0x0000008d, 0x00000002, 
+0x0003001d, 0x00000092, 0x00000006, 0x0003001e, 
+0x00000093, 0x00000092, 0x00040020, 0x00000094, 
+0x00000002, 0x00000093, 0x0004003b, 0x00000094, 
+0x00000095, 0x00000002, 0x0004002b, 0x0000003c, 
+0x00000097, 0x00000001, 0x0003001d, 0x0000009b, 
+0x00000006, 0x0003001e, 0x0000009c, 0x0000009b, 
+0x00040020, 0x0000009d, 0x00000002, 0x0000009c, 
+0x0004003b, 0x0000009d, 0x0000009e, 0x00000002, 
+0x0003001d, 0x000000a2, 0x00000006, 0x0003001e, 
+0x000000a3, 0x000000a2, 0x00040020, 0x000000a4, 
+0x00000002, 0x000000a3, 0x0004003b, 0x000000a4, 
+0x000000a5, 0x00000002, 0x0006002c, 0x0000003f, 
+0x000000ad, 0x00000097, 0x00000097, 0x00000097, 
+0x00050036, 0x00000002, 0x00000004, 0x00000000, 
+0x00000003, 0x000200f8, 0x00000005, 0x0004003b, 
+0x0000003d, 0x0000003e, 0x00000007, 0x0004003b, 
+0x0000000d, 0x00000046, 0x00000007, 0x0004003b, 
+0x00000007, 0x00000054, 0x00000007, 0x0004003b, 
+0x0000000d, 0x0000005b, 0x00000007, 0x0004003b, 
+0x00000007, 0x0000006b, 0x00000007, 0x0004003b, 
+0x00000007, 0x00000073, 0x00000007, 0x0004003b, 
+0x0000000d, 0x00000074, 0x00000007, 0x0004003b, 
+0x0000000d, 0x00000076, 0x00000007, 0x0004003b, 
+0x00000007, 0x00000078, 0x00000007, 0x0004003b, 
+0x00000007, 0x0000007b, 0x00000007, 0x0004003b, 
+0x0000000d, 0x0000007f, 0x00000007, 0x0004003b, 
+0x00000007, 0x00000086, 0x00000007, 0x0004003b, 
+0x00000007, 0x000000a7, 0x00000007, 0x0004003b, 
+0x00000007, 0x000000a9, 0x00000007, 0x00050041, 
+0x00000043, 0x00000044, 0x00000041, 0x00000042, 
+0x0004003d, 0x0000003c, 0x00000045, 0x00000044, 
+0x0003003e, 0x0000003e, 0x00000045, 0x00060041, 
+0x0000004d, 0x0000004e, 0x0000004a, 0x0000004c, 
+0x0000004c, 0x0004003d, 0x00000006, 0x0000004f, 
+0x0000004e, 0x00060041, 0x0000004d, 0x00000051, 
+0x0000004a, 0x0000004c, 0x00000050, 0x0004003d, 
+0x00000006, 0x00000052, 0x00000051, 0x00050050, 
+0x0000000c, 0x00000053, 0x0000004f, 0x00000052, 
+0x0003003e, 0x00000046, 0x00000053, 0x00060041, 
+0x0000004d, 0x00000059, 0x00000058, 0x0000004c, 
+0x0000004c, 0x0004003d, 0x00000006, 0x0000005a, 
+0x00000059, 0x0003003e, 0x00000054, 0x0000005a, 
+0x0004003d, 0x0000003c, 0x00000060, 0x0000003e, 
+0x00060041, 0x0000004d, 0x00000061, 0x0000005f, 
+0x0000004c, 0x00000060, 0x0004003d, 0x00000006, 
+0x00000062, 0x00000061, 0x0004003d, 0x0000003c, 
+0x00000067, 0x0000003e, 0x00060041, 0x0000004d, 
+0x00000068, 0x00000066, 0x0000004c, 0x00000067, 
+0x0004003d, 0x00000006, 0x00000069, 0x00000068, 
+0x00050050, 0x0000000c, 0x0000006a, 0x00000062, 
+0x00000069, 0x0003003e, 0x0000005b, 0x0000006a, 
+0x0004003d, 0x0000003c, 0x00000070, 0x0000003e, 
+0x00060041, 0x0000004d, 0x00000071, 0x0000006f, 
+0x0000004c, 0x00000070, 0x0004003d, 0x00000006, 
+0x00000072, 0x00000071, 0x0003003e, 0x0000006b, 
+0x00000072, 0x0004003d, 0x0000000c, 0x00000075, 
+0x0000005b, 0x0003003e, 0x00000074, 0x00000075, 
+0x0004003d, 0x0000000c, 0x00000077, 0x00000046, 
+0x0003003e, 0x00000076, 0x00000077, 0x0004003d, 
+0x00000006, 0x00000079, 0x00000054, 0x0003003e, 
+0x00000078, 0x00000079, 0x00070039, 0x00000006, 
+0x0000007a, 0x00000012, 0x00000074, 0x00000076, 
+0x00000078, 0x0003003e, 0x00000073, 0x0000007a, 
+0x0004003d, 0x00000006, 0x0000007c, 0x00000073, 
+0x0004003d, 0x00000006, 0x0000007d, 0x0000006b, 
+0x00050083, 0x00000006, 0x0000007e, 0x0000007c, 
+0x0000007d, 0x0003003e, 0x0000007b, 0x0000007e, 
+0x00050088, 0x00000006, 0x00000081, 0x00000019, 
+0x00000080, 0x0004003d, 0x0000000c, 0x00000082, 
+0x0000005b, 0x0005008e, 0x0000000c, 0x00000083, 
+0x00000082, 0x00000081, 0x0004003d, 0x00000006, 
+0x00000084, 0x0000007b, 0x0005008e, 0x0000000c, 
+0x00000085, 0x00000083, 0x00000084, 0x0003003e, 
+0x0000007f, 0x00000085, 0x00050088, 0x00000006, 
+0x00000087, 0x00000019, 0x00000080, 0x0004003d, 
+0x00000006, 0x00000088, 0x0000007b, 0x00050085, 
+0x00000006, 0x00000089, 0x00000087, 0x00000088, 
+0x0003003e, 0x00000086, 0x00000089, 0x0004003d, 
+0x0000003c, 0x0000008e, 0x0000003e, 0x00050041, 
+0x00000007, 0x0000008f, 0x0000007f, 0x00000042, 
+0x0004003d, 0x00000006, 0x00000090, 0x0000008f, 
+0x00060041, 0x0000004d, 0x00000091, 0x0000008d, 
+0x0000004c, 0x0000008e, 0x0003003e, 0x00000091, 
+0x00000090, 0x0004003d, 0x0000003c, 0x00000096, 
+0x0000003e, 0x00050041, 0x00000007, 0x00000098, 
+0x0000007f, 0x00000097, 0x0004003d, 0x00000006, 
+0x00000099, 0x00000098, 0x00060041, 0x0000004d, 
+0x0000009a, 0x00000095, 0x0000004c, 0x00000096, 
+0x0003003e, 0x0000009a, 0x00000099, 0x0004003d, 
+0x0000003c, 0x0000009f, 0x0000003e, 0x0004003d, 
+0x00000006, 0x000000a0, 0x00000086, 0x00060041, 
+0x0000004d, 0x000000a1, 0x0000009e, 0x0000004c, 
+0x0000009f, 0x0003003e, 0x000000a1, 0x000000a0, 
+0x0004003d, 0x0000003c, 0x000000a6, 0x0000003e, 
+0x0004003d, 0x00000006, 0x000000a8, 0x00000073, 
+0x0003003e, 0x000000a7, 0x000000a8, 0x0004003d, 
+0x00000006, 0x000000aa, 0x0000006b, 0x0003003e, 
+0x000000a9, 0x000000aa, 0x00060039, 0x00000006, 
+0x000000ab, 0x00000017, 0x000000a7, 0x000000a9, 
+0x00060041, 0x0000004d, 0x000000ac, 0x000000a5, 
+0x0000004c, 0x000000a6, 0x0003003e, 0x000000ac, 
+0x000000ab, 0x000100fd, 0x00010038, 0x00050036, 
+0x00000006, 0x0000000a, 0x00000000, 0x00000008, 
+0x00030037, 0x00000007, 0x00000009, 0x000200f8, 
+0x0000000b, 0x0004003d, 0x00000006, 0x0000001a, 
+0x00000009, 0x0004007f, 0x00000006, 0x0000001b, 
+0x0000001a, 0x0006000c, 0x00000006, 0x0000001c, 
+0x00000001, 0x0000001b, 0x0000001b, 0x00050081, 
+0x00000006, 0x0000001d, 0x00000019, 0x0000001c, 
+0x00050088, 0x00000006, 0x0000001e, 0x00000019, 
+0x0000001d, 0x000200fe, 0x0000001e, 0x00010038, 
+0x00050036, 0x00000006, 0x00000012, 0x00000000, 
+0x0000000e, 0x00030037, 0x0000000d, 0x0000000f, 
+0x00030037, 0x0000000d, 0x00000010, 0x00030037, 
+0x00000007, 0x00000011, 0x000200f8, 0x00000013, 
+0x0004003b, 0x00000007, 0x00000021, 0x00000007, 
+0x0004003b, 0x00000007, 0x00000027, 0x00000007, 
+0x0004003b, 0x00000007, 0x00000028, 0x00000007, 
+0x0004003d, 0x0000000c, 0x00000022, 0x00000010, 
+0x0004003d, 0x0000000c, 0x00000023, 0x0000000f, 
+0x00050094, 0x00000006, 0x00000024, 0x00000022, 
+0x00000023, 0x0004003d, 0x00000006, 0x00000025, 
+0x00000011, 0x00050081, 0x00000006, 0x00000026, 
+0x00000024, 0x00000025, 0x0003003e, 0x00000021, 
+0x00000026, 0x0004003d, 0x00000006, 0x00000029, 
+0x00000021, 0x0003003e, 0x00000028, 0x00000029, 
+0x00050039, 0x00000006, 0x0000002a, 0x0000000a, 
+0x00000028, 0x0003003e, 0x00000027, 0x0000002a, 
+0x0004003d, 0x00000006, 0x0000002b, 0x00000027, 
+0x000200fe, 0x0000002b, 0x00010038, 0x00050036, 
+0x00000006, 0x00000017, 0x00000000, 0x00000014, 
+0x00030037, 0x00000007, 0x00000015, 0x00030037, 
+0x00000007, 0x00000016, 0x000200f8, 0x00000018, 
+0x0004003d, 0x00000006, 0x0000002e, 0x00000016, 
+0x0004003d, 0x00000006, 0x0000002f, 0x00000015, 
+0x0006000c, 0x00000006, 0x00000030, 0x00000001, 
+0x0000001c, 0x0000002f, 0x00050085, 0x00000006, 
+0x00000031, 0x0000002e, 0x00000030, 0x0004003d, 
+0x00000006, 0x00000032, 0x00000016, 0x00050083, 
+0x00000006, 0x00000033, 0x00000019, 0x00000032, 
+0x0004003d, 0x00000006, 0x00000034, 0x00000015, 
+0x00050083, 0x00000006, 0x00000035, 0x00000019, 
+0x00000034, 0x0006000c, 0x00000006, 0x00000036, 
+0x00000001, 0x0000001c, 0x00000035, 0x00050085, 
+0x00000006, 0x00000037, 0x00000033, 0x00000036, 
+0x00050081, 0x00000006, 0x00000038, 0x00000031, 
+0x00000037, 0x0004007f, 0x00000006, 0x00000039, 
+0x00000038, 0x000200fe, 0x00000039, 0x00010038 };
+} // namespace kp
+
+
diff --git a/kompute/src/shaders/glsl/ShaderOpMult.comp b/kompute/src/shaders/glsl/ShaderOpMult.comp
new file mode 100644
index 000000000..d54865037
--- /dev/null
+++ b/kompute/src/shaders/glsl/ShaderOpMult.comp
@@ -0,0 +1,28 @@
+#version 450
+// Element-wise product of two float buffers: valuesOutput[i] = valuesLhs[i] * valuesRhs[i].
+layout(set = 0, binding = 0) buffer tensorLhs {
+   float valuesLhs[ ]; // left operand, read by main
+};
+
+layout(set = 0, binding = 1) buffer tensorRhs {
+   float valuesRhs[ ]; // right operand, read by main
+};
+
+layout(set = 0, binding = 2) buffer tensorOutput {
+   float valuesOutput[ ]; // result, written by main
+};
+// Specialization constants: declared here but never referenced by main in this shader.
+layout (constant_id = 0) const uint LEN_LHS = 0;
+layout (constant_id = 1) const uint LEN_RHS = 0;
+layout (constant_id = 2) const uint LEN_OUT = 0;
+
+layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in; // a single invocation per workgroup
+
+void main() // each invocation multiplies one pair of elements
+{
+	uint index = gl_GlobalInvocationID.x;
+    // NOTE: index is not checked against any of the LEN_* constants before the buffers are accessed.
+    valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
+}
+
+
diff --git a/kompute/src/shaders/glsl/ShaderOpMult.hpp.in b/kompute/src/shaders/glsl/ShaderOpMult.hpp.in
new file mode 100644
index 000000000..5af29c66d
--- /dev/null
+++ b/kompute/src/shaders/glsl/ShaderOpMult.hpp.in
@@ -0,0 +1,101 @@
+#pragma once
+#include <array>
+#include <cstdint>
+
+namespace kp {
+const std::array<uint32_t, 366> SHADEROPMULT_COMP_SPV = { 
+0x07230203, 0x00010000, 0x0008000a, 0x0000002e, 
+0x00000000, 0x00020011, 0x00000001, 0x0006000b, 
+0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, 
+0x00000000, 0x0003000e, 0x00000000, 0x00000001, 
+0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, 
+0x00000000, 0x0000000b, 0x00060010, 0x00000004, 
+0x00000011, 0x00000001, 0x00000001, 0x00000001, 
+0x00030003, 0x00000002, 0x000001c2, 0x00040005, 
+0x00000004, 0x6e69616d, 0x00000000, 0x00040005, 
+0x00000008, 0x65646e69, 0x00000078, 0x00080005, 
+0x0000000b, 0x475f6c67, 0x61626f6c, 0x766e496c, 
+0x7461636f, 0x496e6f69, 0x00000044, 0x00060005, 
+0x00000012, 0x736e6574, 0x754f726f, 0x74757074, 
+0x00000000, 0x00070006, 0x00000012, 0x00000000, 
+0x756c6176, 0x754f7365, 0x74757074, 0x00000000, 
+0x00030005, 0x00000014, 0x00000000, 0x00050005, 
+0x00000019, 0x736e6574, 0x684c726f, 0x00000073, 
+0x00060006, 0x00000019, 0x00000000, 0x756c6176, 
+0x684c7365, 0x00000073, 0x00030005, 0x0000001b, 
+0x00000000, 0x00050005, 0x00000021, 0x736e6574, 
+0x6852726f, 0x00000073, 0x00060006, 0x00000021, 
+0x00000000, 0x756c6176, 0x68527365, 0x00000073, 
+0x00030005, 0x00000023, 0x00000000, 0x00040005, 
+0x00000029, 0x5f4e454c, 0x0053484c, 0x00040005, 
+0x0000002a, 0x5f4e454c, 0x00534852, 0x00040005, 
+0x0000002b, 0x5f4e454c, 0x0054554f, 0x00040047, 
+0x0000000b, 0x0000000b, 0x0000001c, 0x00040047, 
+0x00000011, 0x00000006, 0x00000004, 0x00050048, 
+0x00000012, 0x00000000, 0x00000023, 0x00000000, 
+0x00030047, 0x00000012, 0x00000003, 0x00040047, 
+0x00000014, 0x00000022, 0x00000000, 0x00040047, 
+0x00000014, 0x00000021, 0x00000002, 0x00040047, 
+0x00000018, 0x00000006, 0x00000004, 0x00050048, 
+0x00000019, 0x00000000, 0x00000023, 0x00000000, 
+0x00030047, 0x00000019, 0x00000003, 0x00040047, 
+0x0000001b, 0x00000022, 0x00000000, 0x00040047, 
+0x0000001b, 0x00000021, 0x00000000, 0x00040047, 
+0x00000020, 0x00000006, 0x00000004, 0x00050048, 
+0x00000021, 0x00000000, 0x00000023, 0x00000000, 
+0x00030047, 0x00000021, 0x00000003, 0x00040047, 
+0x00000023, 0x00000022, 0x00000000, 0x00040047, 
+0x00000023, 0x00000021, 0x00000001, 0x00040047, 
+0x00000029, 0x00000001, 0x00000000, 0x00040047, 
+0x0000002a, 0x00000001, 0x00000001, 0x00040047, 
+0x0000002b, 0x00000001, 0x00000002, 0x00040047, 
+0x0000002d, 0x0000000b, 0x00000019, 0x00020013, 
+0x00000002, 0x00030021, 0x00000003, 0x00000002, 
+0x00040015, 0x00000006, 0x00000020, 0x00000000, 
+0x00040020, 0x00000007, 0x00000007, 0x00000006, 
+0x00040017, 0x00000009, 0x00000006, 0x00000003, 
+0x00040020, 0x0000000a, 0x00000001, 0x00000009, 
+0x0004003b, 0x0000000a, 0x0000000b, 0x00000001, 
+0x0004002b, 0x00000006, 0x0000000c, 0x00000000, 
+0x00040020, 0x0000000d, 0x00000001, 0x00000006, 
+0x00030016, 0x00000010, 0x00000020, 0x0003001d, 
+0x00000011, 0x00000010, 0x0003001e, 0x00000012, 
+0x00000011, 0x00040020, 0x00000013, 0x00000002, 
+0x00000012, 0x0004003b, 0x00000013, 0x00000014, 
+0x00000002, 0x00040015, 0x00000015, 0x00000020, 
+0x00000001, 0x0004002b, 0x00000015, 0x00000016, 
+0x00000000, 0x0003001d, 0x00000018, 0x00000010, 
+0x0003001e, 0x00000019, 0x00000018, 0x00040020, 
+0x0000001a, 0x00000002, 0x00000019, 0x0004003b, 
+0x0000001a, 0x0000001b, 0x00000002, 0x00040020, 
+0x0000001d, 0x00000002, 0x00000010, 0x0003001d, 
+0x00000020, 0x00000010, 0x0003001e, 0x00000021, 
+0x00000020, 0x00040020, 0x00000022, 0x00000002, 
+0x00000021, 0x0004003b, 0x00000022, 0x00000023, 
+0x00000002, 0x00040032, 0x00000006, 0x00000029, 
+0x00000000, 0x00040032, 0x00000006, 0x0000002a, 
+0x00000000, 0x00040032, 0x00000006, 0x0000002b, 
+0x00000000, 0x0004002b, 0x00000006, 0x0000002c, 
+0x00000001, 0x0006002c, 0x00000009, 0x0000002d, 
+0x0000002c, 0x0000002c, 0x0000002c, 0x00050036, 
+0x00000002, 0x00000004, 0x00000000, 0x00000003, 
+0x000200f8, 0x00000005, 0x0004003b, 0x00000007, 
+0x00000008, 0x00000007, 0x00050041, 0x0000000d, 
+0x0000000e, 0x0000000b, 0x0000000c, 0x0004003d, 
+0x00000006, 0x0000000f, 0x0000000e, 0x0003003e, 
+0x00000008, 0x0000000f, 0x0004003d, 0x00000006, 
+0x00000017, 0x00000008, 0x0004003d, 0x00000006, 
+0x0000001c, 0x00000008, 0x00060041, 0x0000001d, 
+0x0000001e, 0x0000001b, 0x00000016, 0x0000001c, 
+0x0004003d, 0x00000010, 0x0000001f, 0x0000001e, 
+0x0004003d, 0x00000006, 0x00000024, 0x00000008, 
+0x00060041, 0x0000001d, 0x00000025, 0x00000023, 
+0x00000016, 0x00000024, 0x0004003d, 0x00000010, 
+0x00000026, 0x00000025, 0x00050085, 0x00000010, 
+0x00000027, 0x0000001f, 0x00000026, 0x00060041, 
+0x0000001d, 0x00000028, 0x00000014, 0x00000016, 
+0x00000017, 0x0003003e, 0x00000028, 0x00000027, 
+0x000100fd, 0x00010038 };
+} // namespace kp
+
+
diff --git a/kompute/src/shaders/hlsl/computeheadless.comp b/kompute/src/shaders/hlsl/computeheadless.comp
new file mode 100644
index 000000000..ee3cd024f
--- /dev/null
+++ b/kompute/src/shaders/hlsl/computeheadless.comp
@@ -0,0 +1,29 @@
+// Copyright 2020 Google LLC
+
+RWStructuredBuffer<uint> values : register(u0); // read and overwritten in place by main
+[[vk::constant_id(0)]] const uint BUFFER_ELEMENTS = 32; // main ignores thread indices >= this value; 32 unless specialized
+// Returns the n-th Fibonacci number (0, 1, 1, 2, 3, 5, ...), computed iteratively in uint arithmetic.
+uint fibonacci(uint n) {
+	if (n < 2) {
+		return n;
+	}
+	uint newest = 1;
+	uint older = 1;
+	for (uint step = 2; step < n; step++) {
+		uint sum = newest + older;
+		older = newest;
+		newest = sum;
+	}
+	return newest;
+}
+// Replaces values[i] with fibonacci(values[i]) for this thread's index i; threads at or past BUFFER_ELEMENTS do nothing.
+[numthreads(1, 1, 1)]
+void main(uint3 GlobalInvocationID : SV_DispatchThreadID)
+{
+	const uint slot = GlobalInvocationID.x;
+	if (slot < BUFFER_ELEMENTS) {
+		values[slot] = fibonacci(values[slot]);
+	}
+}
+
+
diff --git a/llama.cpp b/llama.cpp
index 6e23a0772..c835c6fd4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9,6 +9,8 @@
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
+#elif defined(GGML_USE_KOMPUTE)
+#   include "ggml-vulkan.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -1182,11 +1184,14 @@ struct llama_context {
 
 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
+#elif defined(GGML_USE_KOMPUTE)
+    ggml_kompute_context * ctx_kompute = NULL;
 #endif
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
+
 };
 
 //
@@ -2474,6 +2479,9 @@ static struct ggml_cgraph * llm_build_llama(
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
+#if defined(GGML_USE_KOMPUTE)
+    struct ggml_tensor * toDeviceTensor = nullptr;
+#endif
 
     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -2483,6 +2491,9 @@ static struct ggml_cgraph * llm_build_llama(
             memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inp_tokens;
+#endif
 
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
@@ -2491,6 +2502,9 @@ static struct ggml_cgraph * llm_build_llama(
 #endif
 
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inpL;
+#endif
 
         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
@@ -2693,7 +2707,6 @@ static struct ggml_cgraph * llm_build_llama(
                 offload_func(cur);
                 ggml_set_name(cur, "ffn_norm");
             }
-
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
@@ -2752,6 +2765,16 @@ static struct ggml_cgraph * llm_build_llama(
 
     ggml_free(ctx0);
 
+#if defined(GGML_USE_KOMPUTE)
+    if (lctx.ctx_kompute && N == 1) {
+        if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
+            ggml_vk_h2d_all(lctx.ctx_kompute);
+        } else {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
+        }
+    }
+#endif
+
     return gf;
 }
 
@@ -3792,6 +3815,17 @@ static bool llama_eval_internal(
     } else {
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
+#elif defined(GGML_USE_KOMPUTE)
+    if (lctx.ctx_kompute && N == 1) {
+        ggml_vk_graph_compute(lctx.ctx_kompute, gf);
+        ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
+    } else {
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+        if (lctx.ctx_kompute) {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v);
+        }
+    }
 #else
     ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
@@ -3833,12 +3867,12 @@ static bool llama_eval_internal(
     }
 
     // extract embeddings
-    if (!lctx.embedding.empty()) {
-        auto & embedding_out = lctx.embedding;
+    //if (!lctx.embedding.empty()) {
+    //    auto & embedding_out = lctx.embedding;
 
-        embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
-    }
+    //    embedding_out.resize(n_embd);
+    //    memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+    //}
 
     // measure the performance only for the single-token evals
     if (N == 1) {
@@ -5904,6 +5938,7 @@ static int llama_apply_lora_from_file_internal(
 ) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
+
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
diff --git a/llama.h b/llama.h
index 350268b9a..3d911adca 100644
--- a/llama.h
+++ b/llama.h
@@ -42,7 +42,7 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
diff --git a/undump.py b/undump.py
new file mode 100644
index 000000000..db19ffe69
--- /dev/null
+++ b/undump.py
@@ -0,0 +1,18 @@
+import struct
+import numpy as np
+from pathlib import Path
+
+def undump(fn):
+    """Read a dump (4 u64 dims, u64 payload byte count, f32 payload) into a squeezed float64 array."""
+    with open(fn, 'rb') as df:
+        dims = struct.unpack('=QQQQ', df.read(8*4))
+        (dsz,) = struct.unpack('=Q', df.read(8))
+        # Assume f32: decode the payload in one pass, then widen so the result stays float64 as before.
+        data = np.frombuffer(df.read(dsz), dtype='=f4').astype(np.float64)
+        return data.reshape(dims).squeeze()
+
+if __name__ == '__main__':
+    # Decode and print every *.dump file in the current directory, in sorted path order.
+    for dump_path in sorted(Path('.').glob('*.dump')):
+        tensor = undump(dump_path)
+        print(f'{dump_path}: {tensor.shape}\n{tensor}')