Nomic vulkan backend licensed under the Software for Open Models License (SOM), version 1.0.

This commit is contained in:
niansa 2023-06-22 12:58:07 +02:00 committed by Cebtenzzre
parent 45855b3f1c
commit ba15dfd0be
95 changed files with 13489 additions and 23 deletions

0
.gitmodules vendored Normal file
View File

View File

@ -86,6 +86,7 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS"
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
@ -412,6 +413,127 @@ if (LLAMA_HIPBLAS)
endif() endif()
endif() endif()
if (LLAMA_KOMPUTE)
# The Kompute backend needs the Vulkan SDK with the glslc shader compiler.
# find_package locates the SDK; find_program resolves the actual glslc
# binary (HINTS Vulkan::glslc uses the imported target's location as a hint).
find_package(Vulkan COMPONENTS glslc REQUIRED)
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
# Fail early at configure time rather than with an obscure build error later.
if (NOT glslc_executable)
message(FATAL_ERROR "glslc not found")
endif()
# compile_shader(SOURCES <file>.comp ...)
#
# For every listed GLSL compute shader this registers two chained custom
# commands:
#   1. glslc compiles <name>.comp into a SPIR-V binary <name>.comp.spv
#      (paths are relative, so outputs land in the current binary directory).
#   2. xxd -i embeds that .spv blob into a C header "shader<name>.h",
#      wrapped in an include guard and the kp::shader_data namespace.
# Nothing is built until a target depends on the generated headers (see the
# generated_shaders target defined below).
function(compile_shader)
set(options)
set(oneValueArgs)
# SOURCES is the only keyword argument (multi-value).
set(multiValueArgs SOURCES)
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
foreach(source ${compile_shader_SOURCES})
set(spv_file ${source}.spv)
# Step 1: GLSL -> SPIR-V, targeting the Vulkan 1.2 environment.
add_custom_command(
OUTPUT ${spv_file}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
COMMENT "Compiling ${source} to ${source}.spv"
)
# Derive the header name ("kompute/op_foo.comp.spv" -> "shaderop_foo.h")
# and an UPPER_SNAKE include-guard symbol from it.
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
set(FILE_NAME "shader${RAW_FILE_NAME}")
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
# Step 2: wrap the SPIR-V blob in a C header via `xxd -i`.
# NOTE(review): this relies on shell evaluation of the '>'/'>>'
# redirections and the \"\#...\" escapes, so VERBATIM is deliberately
# absent; adding it would change the escaping. Requires the external
# 'xxd' tool on PATH at build time.
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
COMMAND xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
DEPENDS ${spv_file}
COMMENT "Converting to hpp: ${FILE_NAME}"
)
endforeach()
endfunction()
# The kompute framework is vendored as a git submodule; warn (instead of
# hard-failing) when the submodule has not been checked out.
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
message(STATUS "Kompute found")
add_subdirectory(kompute)
# Compile our shaders
compile_shader(SOURCES
kompute/op_scale.comp
kompute/op_add.comp
kompute/op_addrow.comp
kompute/op_mul.comp
kompute/op_mulrow.comp
kompute/op_silu.comp
kompute/op_relu.comp
kompute/op_gelu.comp
kompute/op_softmax.comp
kompute/op_norm.comp
kompute/op_rmsnorm.comp
kompute/op_diagmask.comp
kompute/op_mul_mat_f16.comp
kompute/op_mul_mat_q4_0.comp
kompute/op_mul_mat_q4_1.comp
kompute/op_getrows_f16.comp
kompute/op_getrows_q4_0.comp
kompute/op_getrows_q4_1.comp
kompute/op_rope.comp
kompute/op_cpy_f16_f16.comp
kompute/op_cpy_f16_f32.comp
kompute/op_cpy_f32_f16.comp
kompute/op_cpy_f32_f32.comp
)
# Create a custom target for our generated shaders
# NOTE(review): this list must stay 1:1 with the compile_shader SOURCES
# above — each entry is the "shader<op>.h" name compile_shader derives.
add_custom_target(generated_shaders DEPENDS
shaderop_scale.h
shaderop_add.h
shaderop_addrow.h
shaderop_mul.h
shaderop_mulrow.h
shaderop_silu.h
shaderop_relu.h
shaderop_gelu.h
shaderop_softmax.h
shaderop_norm.h
shaderop_rmsnorm.h
shaderop_diagmask.h
shaderop_mul_mat_f16.h
shaderop_mul_mat_q4_0.h
shaderop_mul_mat_q4_1.h
shaderop_getrows_f16.h
shaderop_getrows_q4_0.h
shaderop_getrows_q4_1.h
shaderop_rope.h
shaderop_cpy_f16_f16.h
shaderop_cpy_f16_f32.h
shaderop_cpy_f32_f16.h
shaderop_cpy_f32_f32.h
)
# Create a custom command that depends on the generated_shaders
# (a stamp file is used because a custom *target* cannot appear directly in
# a library's source list with proper dependency tracking).
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
DEPENDS generated_shaders
COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
)
# Add the stamp to the main sources to ensure dependency tracking
set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
set(GGML_HEADERS_KOMPUTE ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
add_compile_definitions(GGML_USE_KOMPUTE)
# kompute library target comes from the add_subdirectory(kompute) above;
# the binary dir is added to includes so the generated shader headers and
# the stamp are found.
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
else()
message(WARNING "Kompute not found")
endif()
endif()
if (LLAMA_ALL_WARNINGS) if (LLAMA_ALL_WARNINGS)
if (NOT MSVC) if (NOT MSVC)
set(c_flags set(c_flags
@ -648,6 +770,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
) )
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

30
LICENSE_SOM.txt Normal file
View File

@ -0,0 +1,30 @@
Software for Open Models License (SOM)
Version 1.0 dated August 30th, 2023
This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
1. Definitions
The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
A “Model” is the output of a machine learning algorithm, and excludes the Software.
“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
2. Grant of Rights. Subject to the conditions and limitations in section 3:
(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
3. Conditions and Limitations
(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms.
(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
(C) No Trademark License. This license does not grant you rights to use the Licensor's name, logo, or trademarks.
(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.

View File

@ -33,6 +33,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#if defined(GGML_USE_KOMPUTE)
#include "ggml-vulkan.h"
#endif
static llama_context ** g_ctx; static llama_context ** g_ctx;
static llama_model ** g_model; static llama_model ** g_model;
static gpt_params * g_params; static gpt_params * g_params;
@ -171,6 +175,10 @@ int main(int argc, char ** argv) {
g_model = &model; g_model = &model;
g_ctx = &ctx; g_ctx = &ctx;
#if defined(GGML_USE_KOMPUTE)
ggml_vk_init_device(0, "gpu");
#endif
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__); LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);

1313
ggml-vulkan.cpp Normal file

File diff suppressed because it is too large Load Diff

61
ggml-vulkan.h Normal file
View File

@ -0,0 +1,61 @@
/**
 * Copyright (c) 2023 Nomic, Inc. All rights reserved.
 *
 * This software is licensed under the terms of the Software for Open Models License (SOM),
 * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
 * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
 */
// Public interface of the Kompute (Vulkan) backend for ggml.
#pragma once
#include <cstddef>
#include <vector>
#include <string>
// Opaque backend context; definition lives in ggml-vulkan.cpp.
struct ggml_kompute_context;
// Minimal forward declarations so including this header does not pull in
// the full Vulkan headers.
namespace vk {
class DeviceMemory;
class Buffer;
};
// One backend allocation: a host pointer plus the Vulkan memory/buffer
// handles backing it (a primary pair and an optional staging pair).
struct ggml_vk_memory {
void *data = nullptr; // host-side pointer (presumably a mapped view — confirm in ggml-vulkan.cpp)
size_t size = 0; // allocation size in bytes
vk::DeviceMemory *primaryMemory = nullptr;
vk::Buffer *primaryBuffer = nullptr;
vk::DeviceMemory *stagingMemory = nullptr;
vk::Buffer *stagingBuffer = nullptr;
};
// Description of one physical device as enumerated by the backend.
struct ggml_vk_device {
int index = 0;
int type = 0; // same as VkPhysicalDeviceType
size_t heapSize = 0; // heap size in bytes (TODO confirm which heap in ggml-vulkan.cpp)
std::string name;
std::string vendor;
};
// Enumerate devices able to satisfy `memoryRequired` bytes.
std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
// Select the device to use. Overloads take a device selector string, a
// descriptor, or an index; the bool result signals success/failure
// (NOTE(review): inferred from the signature — verify in ggml-vulkan.cpp).
bool ggml_vk_init_device(size_t memoryRequired, const std::string &device);
bool ggml_vk_init_device(const ggml_vk_device &device);
bool ggml_vk_init_device(int device);
bool ggml_vk_has_device();
ggml_vk_device ggml_vk_current_device();
// Context lifecycle.
struct ggml_kompute_context * ggml_vk_init(void);
bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx);
void ggml_vk_free(struct ggml_kompute_context * ctx);
// Memory management helpers.
size_t ggml_vk_aligned_offset(size_t offset);
ggml_vk_memory ggml_vk_allocate(size_t size);
void ggml_vk_free_memory(ggml_vk_memory &memory);
// Register a named memory region with the context.
void ggml_vk_add_buffer(
struct ggml_kompute_context * ctx,
const char * name,
const ggml_vk_memory &memory);
// h2d/d2h transfers: all registered buffers, or a single tensor.
void ggml_vk_h2d_all(struct ggml_kompute_context * ctx);
void ggml_vk_d2h_all(struct ggml_kompute_context * ctx);
void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
// Execute a ggml computation graph on the selected device.
void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);

32
ggml.c
View File

@ -9007,7 +9007,7 @@ static void ggml_compute_forward_add_q_f32(
} }
} }
static void ggml_compute_forward_add( void ggml_compute_forward_add(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
@ -9587,7 +9587,7 @@ static void ggml_compute_forward_mul_f32(
} }
} }
static void ggml_compute_forward_mul( void ggml_compute_forward_mul(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
@ -10510,7 +10510,7 @@ static void ggml_compute_forward_elu(
// ggml_compute_forward_relu // ggml_compute_forward_relu
static void ggml_compute_forward_relu_f32( void ggml_compute_forward_relu_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -10534,7 +10534,7 @@ static void ggml_compute_forward_relu_f32(
} }
} }
static void ggml_compute_forward_relu( void ggml_compute_forward_relu(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -10552,7 +10552,7 @@ static void ggml_compute_forward_relu(
// ggml_compute_forward_gelu // ggml_compute_forward_gelu
static void ggml_compute_forward_gelu_f32( void ggml_compute_forward_gelu_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -10593,7 +10593,7 @@ static void ggml_compute_forward_gelu_f32(
} }
} }
static void ggml_compute_forward_gelu( void ggml_compute_forward_gelu(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -10670,7 +10670,7 @@ static void ggml_compute_forward_gelu_quick(
// ggml_compute_forward_silu // ggml_compute_forward_silu
static void ggml_compute_forward_silu_f32( void ggml_compute_forward_silu_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -10711,7 +10711,7 @@ static void ggml_compute_forward_silu_f32(
} }
} }
static void ggml_compute_forward_silu( void ggml_compute_forward_silu(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -10844,7 +10844,7 @@ static void ggml_compute_forward_norm_f32(
} }
} }
static void ggml_compute_forward_norm( void ggml_compute_forward_norm(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -10910,7 +10910,7 @@ static void ggml_compute_forward_rms_norm_f32(
} }
} }
static void ggml_compute_forward_rms_norm( void ggml_compute_forward_rms_norm(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -11623,7 +11623,7 @@ static void ggml_compute_forward_scale_f32(
} }
} }
static void ggml_compute_forward_scale( void ggml_compute_forward_scale(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
@ -11744,7 +11744,7 @@ static void ggml_compute_forward_set(
// ggml_compute_forward_cpy // ggml_compute_forward_cpy
static void ggml_compute_forward_cpy( void ggml_compute_forward_cpy(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -11888,7 +11888,7 @@ static void ggml_compute_forward_get_rows_f32(
} }
} }
static void ggml_compute_forward_get_rows( void ggml_compute_forward_get_rows(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
@ -12164,7 +12164,7 @@ static void ggml_compute_forward_diag_mask_f32(
} }
} }
static void ggml_compute_forward_diag_mask_inf( void ggml_compute_forward_diag_mask_inf(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -12198,7 +12198,7 @@ static void ggml_compute_forward_diag_mask_zero(
// ggml_compute_forward_soft_max // ggml_compute_forward_soft_max
static void ggml_compute_forward_soft_max_f32( void ggml_compute_forward_soft_max_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -12887,7 +12887,7 @@ static void ggml_compute_forward_rope_f16(
} }
} }
static void ggml_compute_forward_rope( void ggml_compute_forward_rope(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {

27
kompute/.ccls Normal file
View File

@ -0,0 +1,27 @@
%clang
-fdeclspec
-fms-extensions
-Wall
-Wextra
-std=c++17
%h -x
%h c++-header
-DDEBUG=1
-DKOMPUTE_INCLUDE_FOR_SYNTAX
-I/usr/include/python3.6/
-I./python/pybind11/include/
-I./build/_deps/vulkan_header-src/include/
-I./build/_deps/spdlog-src/include/
-I./build/_deps/googletest-src/googletest/include/
-I./build/_deps/fmt-src/include/
-I./src/include/
-I./build/src/shaders/glsl/
-I./build/test/shaders/glsl/
-I./test/utils/

5
kompute/.clang-format Normal file
View File

@ -0,0 +1,5 @@
---
BasedOnStyle: Mozilla
IndentWidth: 4
...

4
kompute/.dockerignore Normal file
View File

@ -0,0 +1,4 @@
build/*
examples/*
docker-builders/
swiftshader/

View File

@ -0,0 +1,58 @@
name: C++ Tests

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  # Builds and runs the array-multiplication example against the SwiftShader
  # software Vulkan implementation (no GPU available on hosted runners).
  array-multiplication-example:
    runs-on: ubuntu-latest
    container: axsauze/kompute-builder:0.4
    env:
      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: false
      - name: "[Release g++] Build & Test"
        uses: KomputeProject/action-cmake-build@master
        with:
          build-dir: ${{github.workspace}}/examples/array_multiplication/build
          source-dir: ${{github.workspace}}/examples/array_multiplication
          cc: gcc
          cxx: g++
          build-type: Debug
          run-test: false
          ctest-options: -V
          # Fix: the second option was missing its -D prefix, so CMake would
          # have treated "KOMPUTE_OPT_FROM_SOURCE=ON" as a stray path argument.
          configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON -DKOMPUTE_OPT_FROM_SOURCE=ON
          build-options: --parallel # Given we don't build too many resources we can leverage parallel
      - name: Run tests
        run: ./examples/array_multiplication/build/src/kompute_array_mult

  # Same setup for the logistic-regression example. (Job id typo fixed:
  # was "logistc-regression-example".)
  logistic-regression-example:
    runs-on: ubuntu-latest
    container: axsauze/kompute-builder:0.4
    env:
      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: false
      - name: "[Release g++] Build & Test"
        uses: KomputeProject/action-cmake-build@master
        with:
          build-dir: ${{github.workspace}}/examples/logistic_regression/build
          source-dir: ${{github.workspace}}/examples/logistic_regression
          cc: gcc
          cxx: g++
          build-type: Debug
          run-test: false
          ctest-options: -V
          # Fix: missing -D prefix on the second option (see job above).
          configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON -DKOMPUTE_OPT_FROM_SOURCE=ON
          build-options: --parallel # Given we don't build too many resources we can leverage parallel
      - name: Run tests
        run: ./examples/logistic_regression/build/src/kompute_logistic_regression

104
kompute/.github/workflows/cpp_tests.yml vendored Normal file
View File

@ -0,0 +1,104 @@
name: C++ Tests

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

# Four build matrix points, written out explicitly:
# {Debug, Release} x {with, without} Vulkan debug layers.
# All jobs run against SwiftShader (software Vulkan) inside the builder image.
jobs:
  cpp-tests-debug-with-debug-layers:
    runs-on: ubuntu-latest
    container: axsauze/kompute-builder:0.4
    env:
      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: false
      - name: "[Release g++] Build & Test"
        uses: KomputeProject/action-cmake-build@master
        with:
          build-dir: ${{github.workspace}}/build
          source-dir: ${{github.workspace}}
          cc: gcc
          cxx: g++
          build-type: Debug
          run-test: false
          ctest-options: -V
          configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
      - name: Run tests
        run: make mk_run_tests

  cpp-tests-release-with-debug-layers:
    runs-on: ubuntu-latest
    container: axsauze/kompute-builder:0.4
    env:
      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: false
      - name: "[Release g++] Build & Test"
        uses: KomputeProject/action-cmake-build@master
        with:
          build-dir: ${{github.workspace}}/build
          source-dir: ${{github.workspace}}
          cc: gcc
          cxx: g++
          build-type: Release
          run-test: false
          ctest-options: -V
          configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
      - name: Run tests
        run: make mk_run_tests

  cpp-tests-debug-without-debug-layers:
    runs-on: ubuntu-latest
    container: axsauze/kompute-builder:0.4
    env:
      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: false
      - name: "[Release g++] Build & Test"
        uses: KomputeProject/action-cmake-build@master
        with:
          build-dir: ${{github.workspace}}/build
          source-dir: ${{github.workspace}}
          cc: gcc
          cxx: g++
          build-type: Debug
          run-test: false
          ctest-options: -V
          configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
      - name: Run tests
        run: make mk_run_tests

  cpp-tests-release-without-debug-layers:
    runs-on: ubuntu-latest
    container: axsauze/kompute-builder:0.4
    env:
      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: false
      - name: "[Release g++] Build & Test"
        uses: KomputeProject/action-cmake-build@master
        with:
          build-dir: ${{github.workspace}}/build
          source-dir: ${{github.workspace}}
          cc: gcc
          cxx: g++
          build-type: Release
          run-test: false
          ctest-options: -V
          configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
      - name: Run tests
        run: make mk_run_tests

View File

@ -0,0 +1,28 @@
name: Python Tests

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  python-tests:
    runs-on: ubuntu-latest
    container: axsauze/kompute-builder:0.4
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: false
      - name: Install Python Requirements
        run: pip3 install --user -r python/test/requirements-dev.txt
      - name: Python Build
        env:
          KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2
          KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON
        run: pip3 install --user . -v
      - name: Python run Tests
        run: |
          export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json
          make test_python

187
kompute/CMakeLists.txt Normal file
View File

@ -0,0 +1,187 @@
# SPDX-License-Identifier: Apache-2.0
cmake_minimum_required(VERSION 3.20)
project(kompute VERSION 0.8.1 LANGUAGES CXX)

# Only change the folder behavior if kompute is not a subproject
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
    set_property(GLOBAL PROPERTY USE_FOLDERS ON)
    set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake")
    set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin)
    set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
endif()

# Avoid the dll boilerplate code for windows
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

# C++14 project-wide; fail if the compiler cannot provide it.
# (Fix: CMAKE_CXX_STANDARD was set twice — the duplicate before the
# subproject check was removed.)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")

# Name of the library target(s) consumers should link against.
set(KOMPUTE_LIBRARIES kompute CACHE INTERNAL "")
# ####################################################
# Options
# ####################################################
# kompute_option(<NAME> <help-text> <default>)
#
# Declares a boolean cache option, lets it be overridden by an identically
# named environment variable, and — when enabled — defines <NAME> as a
# compile definition for all targets created afterwards. Must stay a macro:
# the option value has to land in the caller's scope.
macro(kompute_option OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
    option(${OPTION_NAME} ${OPTION_TEXT} ${OPTION_DEFAULT})

    if(DEFINED ENV{${OPTION_NAME}})
        # Allow overriding the option through an environment variable
        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
    endif()

    if(${OPTION_NAME})
        # Modernized from add_definitions(-D...) for consistency with the
        # add_compile_definitions() calls used elsewhere in this file.
        add_compile_definitions(${OPTION_NAME})
    endif()

    message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}")
endmacro()
# kompute_log_level(<NAME> <help-text> <default>)
#
# Declares the string cache variable selecting the log level, restricted to
# the values listed below (for cmake-gui/ccmake dropdowns), overridable via
# an identically named environment variable. When the level is "Off",
# <NAME>_DISABLED is set and exported as a compile definition so logging can
# be compiled out entirely.
macro(kompute_log_level OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
    set_property(CACHE ${OPTION_NAME} PROPERTY STRINGS "Trace" "Debug" "Info" "Warn" "Error" "Critical" "Default" "Off")

    if(DEFINED ENV{${OPTION_NAME}})
        # Allow setting the option through an environment variable
        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
    endif()

    if(${OPTION_NAME})
        add_compile_definitions(${OPTION_NAME})
    endif()

    # Allow disabling logging completely and prevent linking against it.
    # Fix: compare the macro's argument generically instead of the
    # hard-coded KOMPUTE_OPT_LOG_LEVEL name, and quote both sides so an
    # empty value cannot break the if() expansion.
    if("${${OPTION_NAME}}" STREQUAL "Off")
        set(${OPTION_NAME}_DISABLED ON)
        add_compile_definitions(${OPTION_NAME}_DISABLED=1)
    endif()

    message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}")
endmacro()
# kompute_option_string(<NAME> <help-text> <default>)
#
# Declares a free-form string cache variable, overridable through an
# identically named environment variable. When the value is truthy it is
# additionally exported as a compile definition named <NAME>.
macro(kompute_option_string OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})

    if(DEFINED ENV{${OPTION_NAME}})
        # Allow setting the option through an environment variable
        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
    endif()

    if(${OPTION_NAME})
        # Modernized from add_definitions(-D...) for consistency with the
        # add_compile_definitions() calls used elsewhere in this file.
        add_compile_definitions(${OPTION_NAME})
    endif()

    message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}")
endmacro()
message(STATUS "General purpose GPU compute framework built on Vulkan")
message(STATUS "=======================================================")
# Build options
kompute_log_level(KOMPUTE_OPT_LOG_LEVEL "Internally we use Spdlog or fmt for logging, depending on the value of 'KOMPUTE_OPT_USE_SPDLOG'. The log level used can be changed here. Possible values: 'Trace', 'Debug', 'Info', 'Warn', 'Error', 'Critical', 'Off', 'Default'. If set to 'Off' logging will be deactivated completely. If set to 'Default', the log level will be set to 'Info' for release builds and 'Debug' else." "Off")
kompute_option(KOMPUTE_OPT_USE_SPDLOG "If enabled, logging via KP_LOG_<DEBUG, INFO, etc...> will happen through Spdlog instead of plan fmt." OFF)
kompute_option(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS "Explicitly disable debug layers even on debug." ON)
kompute_option(KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK "Whether to check if your driver supports the Vulkan Header version you are linking against. This might be useful in case you build shared on a different system than you run later." OFF)
kompute_option(KOMPUTE_OPT_BUILD_SHADERS "Rebuilds all compute shaders during compilation and does not use the already precompiled versions. Requires glslangValidator to be installed on your system." OFF)
# External components
kompute_option(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG "Use the built-in version of Spdlog. Requires 'KOMPUTE_OPT_USE_SPDLOG' to be set to ON in order to have any effect." ON)
kompute_option(KOMPUTE_OPT_SPDLOG_ASYNC_MODE "If spdlog is enabled this allows for selecting whether the default logger setup creates sync or async logger" OFF)
kompute_option(KOMPUTE_OPT_USE_BUILT_IN_FMT "Use the built-in version of fmt." ON)
kompute_option(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER "Use the built-in version of Vulkan Headers. This could be helpful in case your system Vulkan Headers are too new for your driver. If you set this to OFF, please make sure your system Vulkan Headers are supported by your driver." ON)
kompute_option_string(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "The git tag used for the built-in Vulkan Headers when 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER' is enabled. A list of tags can be found here: https://github.com/KhronosGroup/Vulkan-Headers/tags" "v1.3.231")
message(STATUS "=======================================================")
# ####################################################
# Deprecated Options
# ####################################################
include(cmake/deprecation_warnings.cmake)
# ####################################################
# Dependencies
# ####################################################
include(cmake/vulkan_shader_compiler.cmake)
include(cmake/check_vulkan_version.cmake)
include(FetchContent)
# Vulkan Header
# Either fetch the pinned Vulkan-Headers release, or rely on the system
# headers; in both cases optionally verify the driver supports the version.
if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
FetchContent_Declare(vulkan_header GIT_REPOSITORY https://github.com/KhronosGroup/Vulkan-Headers.git
GIT_TAG ${KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG}) # Source: https://github.com/KhronosGroup/Vulkan-Headers/tags
FetchContent_MakeAvailable(vulkan_header)
if(NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
# Ensure the driver supports this Vulkan version
check_vulkan_version(INCLUDE_DIR "${vulkan_header_SOURCE_DIR}/include")
endif()
endif()
find_package(Vulkan REQUIRED)
# Older CMake versions do not define Vulkan::Headers; synthesize it so the
# rest of the build can link the imported target unconditionally.
if(Vulkan_FOUND AND NOT TARGET Vulkan::Headers)
add_library(Vulkan::Headers INTERFACE IMPORTED)
set_target_properties(Vulkan::Headers PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${Vulkan_INCLUDE_DIRS}")
endif()
if(NOT KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER AND NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
# Ensure the driver supports this Vulkan version
check_vulkan_version(INCLUDE_DIR ${Vulkan_INCLUDE_DIR})
endif()
# Spdlog
# Only pulled in when spdlog logging is requested and logging isn't
# disabled outright (KOMPUTE_OPT_LOG_LEVEL=Off).
if(KOMPUTE_OPT_USE_SPDLOG)
add_compile_definitions(KOMPUTE_OPT_USE_SPDLOG=1)
if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
if(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG)
set(SPDLOG_BUILD_SHARED ${BUILD_SHARED_LIBS})
FetchContent_Declare(spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git
GIT_TAG v1.10.0) # Source: https://github.com/gabime/spdlog/releases
FetchContent_MakeAvailable(spdlog)
else()
find_package(spdlog REQUIRED)
endif()
endif()
endif()
# fmt
if(KOMPUTE_OPT_USE_BUILT_IN_FMT)
FetchContent_Declare(fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git
GIT_TAG 10.0.0) # Source: https://github.com/fmtlib/fmt/releases
FetchContent_MakeAvailable(fmt)
else()
find_package(fmt REQUIRED)
endif()
# ####################################################
# Preprocessor Macros
# ####################################################
if(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS)
add_compile_definitions(KOMPUTE_DISABLE_VK_DEBUG_LAYERS=1)
endif()
# Strict warnings everywhere except MSVC (empty branch is intentional).
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror")
endif()
# If glslang is cloned, then SPIRV/GlslangToSpv.h will be used instead of glslang/SPIRV/GlslangToSpv.h
# As after installation, SPIRV/ header files will be found in glslang/SPIRV/ , more info in #193
if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
add_definitions(-DUSE_EXTERNAL_GLSLANG)
endif()
# Allow scripts to call main kompute Makefile
function(kompute_make KOMPUTE_MAKE_TARGET)
# Always-stale convenience target that shells out to the repo Makefile.
add_custom_target(${KOMPUTE_MAKE_TARGET}
COMMAND make -C ${PROJECT_SOURCE_DIR} ${KOMPUTE_MAKE_TARGET})
endfunction()
# NOTE(review): xxd.c is a C source but project() only enables CXX —
# confirm the C language is enabled (or the file compiles as C++) before
# relying on this target.
add_executable(xxd external/bin/xxd.c)
add_subdirectory(src)

203
kompute/LICENSE Normal file
View File

@ -0,0 +1,203 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2021 The Institute for Ethical AI & Machine Learning
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

210
kompute/Makefile Normal file
View File

@ -0,0 +1,210 @@
# This makefile is optimized to be run from WSL and to interact with the
# Windows host as there are limitations when building GPU programs. This
# makefile contains the commands for interacting with the visual studio
# build via command line for faster iterations, as the intention is to
# support other editors (optimised for vim). There are also commands that
# support the builds for linux-native compilations and these are the commands
# starting with mk_.
# Project version string, read from the VERSION file at the repo root.
VERSION := $(shell cat ./VERSION)
# vcpkg toolchain file locations (Windows path for MSBuild, /c/... for WSL).
# NOTE(review): these default to a developer-specific home directory; override
# them via the environment when building on another machine.
VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsystems\\vcpkg.cmake"
VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake"
# These are the tests that don't work with swiftshader but can be run directly with vulkan
FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps:TestPushConstants.TestConstantsDouble"
# Select tool binaries per platform (Windows, WSL, or native Linux).
ifeq ($(OS),Windows_NT) # is Windows_NT on XP, 2000, 7, Vista, 10...
    CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe"
    SCMP_BIN="C:\\VulkanSDK\\1.2.141.2\\Bin32\\glslangValidator.exe"
    MSBUILD_BIN ?= "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\MSBuild\\Current\\Bin\\MSBuild.exe"
else
    CLANG_FORMAT_BIN ?= "/home/alejandro/Programming/lib/clang+llvm-10.0.0-x86_64-linux-gnu-ubuntu-18.04/bin/clang-format"
    CMAKE_BIN ?= "/c/Program Files/CMake/bin/cmake.exe"
    MSBUILD_BIN ?= "/c/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
    # Choosing the binary based on whether it's on WSL or linux-native
    KERNEL := $(shell uname -r)
    # FIX: the "$" in the regex anchor must be written "$$" so make passes a
    # literal "$" to the shell; a bare "$ " is expanded by make to an empty
    # variable, corrupting the [[ ... ]] expression.
    # NOTE(review): [[ ... =~ ... ]] is a bashism; this assumes make's shell
    # is bash -- confirm, or add `SHELL := /bin/bash` near the top.
    IS_WSL := $(shell (if [[ "$(KERNEL)" =~ Microsoft$$ ]]; then echo '0'; fi))
    ifeq ($(IS_WSL),0)
        SCMP_BIN ?= "/c/VulkanSDK/1.2.141.2/Bin32/glslangValidator.exe"
    else
        SCMP_BIN ?= "/usr/bin/glslangValidator"
    endif
endif
####### Main Target Rules #######
# Publish the generated Sphinx documentation to the gh-pages branch.
push_docs_to_ghpages:
GIT_DEPLOY_DIR="build/docs/sphinx/" \
GIT_DEPLOY_BRANCH="gh-pages" \
GIT_DEPLOY_REPO="origin" \
./scripts/push_folder_to_branch.sh
####### CMAKE quickstart commands #######
# Delete the entire CMake build tree.
clean_cmake:
rm -rf build/
####### Linux-native (mk_) build shortcut commands #######
MK_BUILD_TYPE ?= "Release"
MK_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
MK_CMAKE_EXTRA_FLAGS ?= ""
MK_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
# Configure the Linux build tree with install, tests, docs, shaders and
# code coverage enabled, plus compile_commands.json for tooling.
mk_cmake:
cmake \
-Bbuild \
-DCMAKE_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \
-DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \
-DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \
-DKOMPUTE_OPT_INSTALL=ON \
-DKOMPUTE_OPT_BUILD_TESTS=ON \
-DKOMPUTE_OPT_BUILD_DOCS=ON \
-DKOMPUTE_OPT_BUILD_SHADERS=ON \
-DKOMPUTE_OPT_CODE_COVERAGE=ON \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DKOMPUTE_OPT_LOG_LEVEL=Debug \
$(MK_CMAKE_EXTRA_FLAGS) \
-G "Unix Makefiles"
# Build every target in the configured build tree.
mk_build_all:
cmake --build build/. --parallel
mk_build_docs:
cmake --build build/. --target gendocsall --parallel
mk_build_kompute:
cmake --build build/. --target kompute --parallel
mk_build_tests:
cmake --build build/. --target kompute_tests --parallel
# Build the docs and serve them locally (python2 SimpleHTTPServer, port 8000).
mk_run_docs: mk_build_docs
(cd build/docs/sphinx && python2.7 -m SimpleHTTPServer)
# An alternative would be: ctest -vv --test-dir build/.
# But this is not possible since we need to filter specific tests, not complete executables, which is not possible with ctest.
# https://gitlab.kitware.com/cmake/cmake/-/issues/13168
mk_run_tests: mk_build_tests
./build/bin/kompute_tests --gtest_filter=$(FILTER_TESTS)
# Clone and build SwiftShader to provide a CPU-only Vulkan implementation.
mk_build_swiftshader_library:
git clone https://github.com/google/swiftshader || echo "Assuming already cloned"
# GCC 8 or above is required otherwise error on "filesystem" lib will appear
CC="/usr/bin/gcc-8" CXX="/usr/bin/g++-8" cmake swiftshader/. -Bswiftshader/build/
cmake --build swiftshader/build/. --parallel
# Point the Vulkan loader at the SwiftShader ICD, then run the tests on CPU.
mk_run_tests_cpu: export VK_ICD_FILENAMES=$(PWD)/swiftshader/build/vk_swiftshader_icd.json
mk_run_tests_cpu: mk_build_swiftshader_library mk_build_tests mk_run_tests_cpu_only
####### Visual studio build shortcut commands #######
VS_BUILD_TYPE ?= "Debug"
# Run with multiprocessing / parallel build by default
VS_CMAKE_EXTRA_FLAGS ?= ""
VS_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
VS_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
# Configure the Visual Studio 2019 build tree via the vcpkg toolchain.
vs_cmake:
$(CMAKE_BIN) \
-Bbuild \
$(VS_CMAKE_EXTRA_FLAGS) \
-DCMAKE_TOOLCHAIN_FILE=$(VCPKG_WIN_PATH) \
-DCMAKE_CXX_FLAGS=$(VS_KOMPUTE_EXTRA_CXX_FLAGS) \
-DCMAKE_INSTALL_PREFIX=$(VS_INSTALL_PATH) \
-DKOMPUTE_OPT_INSTALL=ON \
-DKOMPUTE_OPT_BUILD_TESTS=ON \
-DKOMPUTE_OPT_BUILD_SHADERS=ON \
-DKOMPUTE_OPT_CODE_COVERAGE=OFF \
-DKOMPUTE_OPT_BUILD_DOCS=OFF \
-G "Visual Studio 16 2019" \
-DCMAKE_BUILD_TYPE=$(VS_BUILD_TYPE)
vs_build_all:
cmake --build build/. --parallel
vs_build_docs:
cmake --build build/. --target gendocsall --parallel
vs_install_kompute:
cmake --build build/. --target install --parallel
vs_build_kompute:
cmake --build build/. --target kompute --parallel
vs_build_tests:
cmake --build build/. --target kompute_tests --parallel
vs_run_docs: vs_build_docs
(cd build/docs/sphinx && python2.7 -m SimpleHTTPServer)
# Run the VS-built test binary (path depends on the configured build type).
vs_run_tests: vs_build_tests
./build/test/$(VS_BUILD_TYPE)/bin/kompute_tests.exe --gtest_filter=$(FILTER_TESTS)
#### PYTHON ####
# Run the Python binding test-suite with verbose CLI logging.
test_python:
python3 -m pytest -s --log-cli-level=DEBUG -v python/test/
####### Run CI Commands #######
# This command uses act to replicate github action
# https://github.com/nektos/act
run_ci:
act
####### General project commands #######
# Generate pybind11 docstrings from the C++ headers for the Python bindings.
generate_python_docstrings:
python -m pybind11_mkdoc \
-o python/src/docstrings.hpp \
kompute/Kompute.hpp \
-Iexternal/fmt/include/ \
-Iexternal/spdlog/include/ \
-Iexternal/glslang/ \
-I/usr/include/c++/7.5.0/
install_python_reqs:
python3 -m pip install -r scripts/requirements.txt
install_lcov:
sudo apt install lcov -y
# Compile the GLSL shaders (library and tests) into C++ header files.
build_shaders:
python3 scripts/convert_shaders.py \
--shader-path shaders/glsl \
--shader-binary $(SCMP_BIN) \
--header-path src/include/kompute/shaders/ \
-v
python3 scripts/convert_shaders.py \
--shader-path test/shaders/glsl \
--shader-binary $(SCMP_BIN) \
--header-path test/compiled_shaders_include/kompute_test/shaders/ \
-v
# Collapse the public headers into the single-include distribution header.
build_single_header:
quom \
--include_directory \
"src/include/" \
"single_include/AggregateHeaders.cpp" \
"single_include/kompute/Kompute.hpp"
win_build_xxd:
cd external/bin/ && gcc.exe -o xxd.exe xxd.c -DCYGWIN
# Apply clang-format in-place to all non-shader C/C++ sources.
# NOTE(review): the quoted for-list means the loop body runs once with all
# four directories passed to find together -- works, but presumably not as
# the loop was intended; verify before refactoring.
format:
for val in "examples single_include src test" ; do \
find $$val -depth -iname *.h -or -iname *.c -or -iname *.hpp -or -iname *.cpp | grep -v "shaders" | xargs $(CLANG_FORMAT_BIN) -style=file -i; \
done
static_scan:
cppcheck --project=build/compile_commands.json -iexternal/
# Regenerate CHANGELOG.md via the dockerized github-changelog-generator.
build_changelog:
docker run --rm -it -v "$(PWD)":/usr/local/src/your-app -e CHANGELOG_GITHUB_TOKEN=${CHANGELOG_GITHUB_TOKEN} ferrarimarco/github-changelog-generator:1.15.2 -u KomputeProject -p kompute
chmod 664 CHANGELOG.md # (Read+Write, Read+Write, Read)
sed -i -e 's/\(HEAD\|Unreleased\)/v${VERSION}/g' CHANGELOG.md # Replacing unreleased version with latest tag

513
kompute/README.md Normal file
View File

@ -0,0 +1,513 @@
![GitHub](https://img.shields.io/badge/Version-0.7.0-green.svg)
![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
![GitHub](https://img.shields.io/badge/Python-3.7—3.9-blue.svg)
![GitHub](https://img.shields.io/badge/License-Apache-black.svg)
[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/4834/badge)](https://bestpractices.coreinfrastructure.org/projects/4834)
<table>
<tr>
<td width="20%">
<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute.jpg">
</td>
<td>
<h1>Kompute</h1>
<h3>The general purpose GPU compute framework for cross vendor graphics cards (AMD, Qualcomm, NVIDIA & friends)</h3>
</td>
</tr>
</table>
<h4>Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU acceleration usecases.</h4>
💬 [Join the Discord & Community Calls](https://kompute.cc/overview/community.html) 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾
<hr>
##### Kompute is backed by the Linux Foundation as a <a href="https://lfaidata.foundation/blog/2021/08/26/kompute-joins-lf-ai-data-as-new-sandbox-project/">hosted project</a> by the LF AI & Data Foundation.
<table>
<tr>
<td>
<a href="https://www.linuxfoundation.org/projects/">
<img src="https://upload.wikimedia.org/wikipedia/commons/b/b5/Linux_Foundation_logo.png">
</a>
</td>
<td>
<a href="https://lfaidata.foundation/projects/">
<img src="https://raw.githubusercontent.com/lfai/artwork/main/lfaidata-assets/lfaidata/horizontal/color/lfaidata-horizontal-color.png">
</a>
</td>
</tr>
</table>
## Principles & Features
* [Flexible Python module](#your-first-kompute-python) with [C++ SDK](#your-first-kompute-c) for optimizations
* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
* [Mobile enabled](#mobile-enabled) with examples via Android NDK across several architectures
* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
* Robust codebase with [90% unit test code coverage](https://kompute.cc/codecov/)
* Advanced use-cases on [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) and [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0).
* Active community with [monthly calls, discord chat and more](https://kompute.cc/overview/community.html)
![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/komputer-logos.gif)
## Getting Started
Below you can find a GPU multiplication example using the C++ and Python Kompute interfaces.
You can [join the Discord](https://discord.gg/MaH5Jv5zwv) for questions / discussion, open a [github issue](https://github.com/KomputeProject/kompute/issues/new), or read [the documentation](https://kompute.cc/).
### Your First Kompute (C++)
The C++ interface provides low level access to the native components of Kompute, enabling for [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html).
```c++
void kompute(const std::string& shader) {
// 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
kp::Manager mgr;
// 2. Create and initialise Kompute Tensors through manager
// Default tensor constructor simplifies creation of float values
auto tensorInA = mgr.tensor({ 2., 2., 2. });
auto tensorInB = mgr.tensor({ 1., 2., 3. });
// Explicit type constructor supports uint32, int32, double, float and bool
auto tensorOutA = mgr.tensorT<uint32_t>({ 0, 0, 0 });
auto tensorOutB = mgr.tensorT<uint32_t>({ 0, 0, 0 });
std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
// 3. Create algorithm based on shader (supports buffers & push/spec constants)
kp::Workgroup workgroup({3, 1, 1});
std::vector<float> specConsts({ 2 });
std::vector<float> pushConstsA({ 2.0 });
std::vector<float> pushConstsB({ 3.0 });
auto algorithm = mgr.algorithm(params,
// See documentation shader section for compileSource
compileSource(shader),
workgroup,
specConsts,
pushConstsA);
// 4. Run operation synchronously using sequence
mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algorithm) // Binds default push consts
->eval() // Evaluates the two recorded operations
->record<kp::OpAlgoDispatch>(algorithm, pushConstsB) // Overrides push consts
->eval(); // Evaluates only last recorded operation
// 5. Sync results from the GPU asynchronously
auto sq = mgr.sequence();
sq->evalAsync<kp::OpTensorSyncLocal>(params);
// ... Do other work asynchronously whilst GPU finishes
sq->evalAwait();
// Prints the first output which is: { 4, 8, 12 }
for (const float& elem : tensorOutA->vector()) std::cout << elem << " ";
// Prints the second output which is: { 10, 10, 10 }
for (const float& elem : tensorOutB->vector()) std::cout << elem << " ";
} // Manages / releases all CPU and GPU memory resources
int main() {
// Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
// files). This shader shows some of the main components including constants, buffers, etc
std::string shader = (R"(
#version 450
layout (local_size_x = 1) in;
// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;
// Kompute also supports spec constants on initialization
layout(constant_id = 0) const float const_one = 0;
void main() {
uint index = gl_GlobalInvocationID.x;
out_a[index] += uint( in_a[index] * in_b[index] );
out_b[index] += uint( const_one * push_const.val );
}
)");
// Run the function declared above with our raw string shader
kompute(shader);
}
```
### Your First Kompute (Python)
The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows.
```python
from .utils import compile_source # using util function from python/test/utils
def kompute(shader):
# 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
mgr = kp.Manager()
# 2. Create and initialise Kompute Tensors through manager
# Default tensor constructor simplifies creation of float values
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
# Explicit type constructor supports uint32, int32, double, float and bool
tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
# 3. Create algorithm based on shader (supports buffers & push/spec constants)
workgroup = (3, 1, 1)
spec_consts = [2]
push_consts_a = [2]
push_consts_b = [3]
# See documentation shader section for compile_source
spirv = compile_source(shader)
algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a)
# 4. Run operation synchronously using sequence
(mgr.sequence()
.record(kp.OpTensorSyncDevice(params))
.record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided
.eval() # evaluates the two recorded ops
.record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts
.eval()) # evaluates only the last recorded op
# 5. Sync results from the GPU asynchronously
sq = mgr.sequence()
sq.eval_async(kp.OpTensorSyncLocal(params))
# ... Do other work asynchronously whilst GPU finishes
sq.eval_await()
# Prints the first output which is: { 4, 8, 12 }
print(tensor_out_a)
# Prints the second output which is: { 10, 10, 10 }
print(tensor_out_b)
if __name__ == "__main__":
# Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
# files). This shader shows some of the main components including constants, buffers, etc
shader = """
#version 450
layout (local_size_x = 1) in;
// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;
// Kompute also supports spec constants on initialization
layout(constant_id = 0) const float const_one = 0;
void main() {
uint index = gl_GlobalInvocationID.x;
out_a[index] += uint( in_a[index] * in_b[index] );
out_b[index] += uint( const_one * push_const.val );
}
"""
kompute(shader)
```
### Interactive Notebooks & Hands on Videos
You are able to try out the interactive Colab Notebooks which allow you to use a free GPU. The available examples are the Python and C++ examples below:
<table>
<tr>
<td width="50%">
<h5>Try the interactive <a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?usp=sharing">C++ Colab</a> from <a href="https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a">Blog Post</a></h5>
</td>
<td>
<h5>Try the interactive <a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">Python Colab</a> from <a href="https://towardsdatascience.com/beyond-cuda-gpu-accelerated-python-for-machine-learning-in-cross-vendor-graphics-cards-made-simple-6cc828a45cc3">Blog Post</a></h5>
</td>
</tr>
<tr>
<td width="50%">
<a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?authuser=1#scrollTo=1BipBsO-fQRD">
<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-cpp.jpg">
</a>
</td>
<td>
<a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">
<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-python.jpg">
</a>
</td>
</tr>
</table>
You can also check out the two following talks presented at the FOSDEM 2021 conference.
Both videos have timestamps which will allow you to skip to the most relevant section for you - the intro & motivations for both is almost the same so you can skip to the more specific content.
<table>
<tr>
<td width="50%">
<h5>Watch the video for <a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">C++ Enthusiasts</a> </h5>
</td>
<td>
<h5>Watch the video for <a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">Python & Machine Learning</a> Enthusiasts</h5>
</td>
</tr>
<tr>
<td width="50%">
<a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">
<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-cpp-video.png">
</a>
</td>
<td>
<a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">
<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-python-video.png">
</a>
</td>
</tr>
</table>
## Architectural Overview
The core architecture of Kompute includes the following:
* [Kompute Manager](https://kompute.cc/overview/reference.html#manager) - Base orchestrator which creates and manages device and child components
* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed in the GPU
To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).
<table>
<th>
Full Architecture
</th>
<th>
Simplified Kompute Components
</th>
<tr>
<td width=30%>
<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-vulkan-architecture.jpg">
<br>
<br>
(very tiny, check the <a href="https://ethicalml.github.io/vulkan-kompute/overview/reference.html">full reference diagram in docs for details</a>)
<br>
<br>
<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/suspicious.jfif">
</td>
<td>
<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-architecture.jpg">
</td>
</tr>
</table>
## Asynchronous and Parallel Operations
Kompute provides flexibility to run operations in an asynchronous way through vk::Fences. Furthermore, Kompute enables for explicit allocation of queues, which allow for parallel execution of operations across queue families.
The image below provides an intuition on how Kompute Sequences can be allocated to different queues to enable parallel execution based on hardware. You can see the [hands on example](https://kompute.cc/overview/advanced-examples.html#parallel-operations), as well as the [detailed documentation page](https://kompute.cc/overview/async-parallel.html) describing how it would work using an NVIDIA 1650 as an example.
![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/queue-allocation.jpg)
## Mobile Enabled
Kompute has been optimized to work in mobile environments. The [build system](#build-overview) enables for dynamic loading of the Vulkan shared library for Android environments, together with a working [Android NDK wrapper](https://github.com/KomputeProject/kompute/tree/master/vk_ndk_wrapper_include) for the CPP headers.
<table>
<tr>
<td width="70%">
<p>
For a full deep dive you can read the blog post "<a href="https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617">Supercharging your Mobile Apps with On-Device GPU Accelerated Machine Learning</a>".
You can also access the <a href="https://github.com/KomputeProject/kompute/tree/v0.4.0/examples/android/android-simple">end-to-end example code</a> in the repository, which can be run using android studio.
</p>
<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-editor.jpg">
</td>
<td width="30%">
<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-kompute.jpg">
</td>
</tr>
</table>
## More examples
### Simple examples
* [Simple multiplication example](https://kompute.cc/overview/advanced-examples.html#simple-shader-example)
* [Record batch commands with a Kompute Sequence](https://kompute.cc/overview/advanced-examples.html#record-batch-commands)
* [Run Asynchronous Operations](https://kompute.cc/overview/advanced-examples.html#asynchronous-operations)
* [Run Parallel Operations Across Multiple GPU Queues](https://kompute.cc/overview/advanced-examples.html#parallel-operations)
* [Create your custom Kompute Operations](https://kompute.cc/overview/advanced-examples.html#your-custom-kompute-operation)
* [Implementing logistic regression from scratch](https://kompute.cc/overview/advanced-examples.html#logistic-regression-example)
### End-to-end examples
* [Machine Learning Logistic Regression Implementation](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a)
* [Parallelizing GPU-intensive Workloads via Multi-Queue Operations](https://towardsdatascience.com/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc)
* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
## Python Package
Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc.
The only dependencies are Python 3.5+ and Cmake 3.4.1+. You can install Kompute from the [Python pypi package](https://pypi.org/project/kp/) using the following command.
```
pip install kp
```
You can also install from master branch using:
```
pip install git+git://github.com/KomputeProject/kompute.git@master
```
For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
## C++ Build Overview
The build system provided uses `cmake`, which allows for cross platform builds.
The top level `Makefile` provides a set of optimized configurations for development as well as the docker image build, but you can start a build with the following command:
```
cmake -Bbuild
```
You also are able to add Kompute in your repo with `add_subdirectory` - the [Android example CMakeLists.txt file](https://github.com/KomputeProject/kompute/blob/7c8c0eeba2cdc098349fcd999102bb2cca1bf711/examples/android/android-simple/app/src/main/cpp/CMakeLists.txt#L3) shows how this would be done.
For a more advanced overview of the build configuration check out the [Build System Deep Dive](https://kompute.cc/overview/build-system.html) documentation.
## Kompute Development
We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Kompute and reporting issues is a great contribution!
### Contributing
#### Dev Dependencies
* Testing
+ GTest
* Documentation
+ Doxygen (with Dot)
+ Sphinx
#### Development
* Follows Mozilla C++ Style Guide https://www-archive.mozilla.org/hacking/mozilla-style-guide.html
+ Uses post-commit hook to run the linter, you can set it up so it runs the linter before commit
+ All dependencies are defined in vcpkg.json
* Uses cmake as build system, and provides a top level makefile with recommended command
* Uses xxd (or xxd.exe windows 64bit port) to convert shader spirv to header files
* Uses doxygen and sphinx for documentation and autodocs
* Uses vcpkg for finding the dependencies, it's the recommended set up to retrieve the libraries
If you want to run with debug layers you can add them with the `KOMPUTE_ENV_DEBUG_LAYERS` parameter as:
```
export KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_LUNARG_api_dump"
```
##### Updating documentation
To update the documentation you will need to:
* Run the gendoxygen target in the build system
* Run the gensphynx target in the build-system
* Push to github pages with `make push_docs_to_ghpages`
##### Running tests
Running the unit tests has been significantly simplified for contributors.
The tests run on CPU, and can be triggered using the ACT command line interface (https://github.com/nektos/act) - once you install the command line (And start the Docker daemon) you just have to type:
```
$ act
[Python Tests/python-tests] 🚀 Start image=axsauze/kompute-builder:0.2
[C++ Tests/cpp-tests ] 🚀 Start image=axsauze/kompute-builder:0.2
[C++ Tests/cpp-tests ] 🐳 docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
[Python Tests/python-tests] 🐳 docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
...
```
The repository contains unit tests for the C++ and Python code, and can be found under the `test/` and `python/test` folder.
The tests are currently run through the CI using Github Actions. It uses the images found in `docker-builders/`.
In order to minimise hardware requirements the tests can run without a GPU, directly in the CPU using [Swiftshader](https://github.com/google/swiftshader).
For more information on how the CI and tests are setup, you can go to the [CI, Docker and Tests Section](https://kompute.cc/overview/ci-tests.html) in the documentation.
## Motivations
This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of the Vulkan SDK. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
We are currently developing Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on the Vulkan SDK's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Kompute architecture.

106
kompute/cmake/bin2h.cmake Normal file
View File

@ -0,0 +1,106 @@
##################################################################################
# Based on: https://github.com/sivachandran/cmake-bin2h
#
# Copyright 2020 Sivachandran Paramasivam
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
##################################################################################
include(CMakeParseArguments)
# Wraps the string held in the named variable into multiple lines of at most
# AT_COLUMN characters each. Lines are joined with '\n'; note the result
# intentionally starts with a leading newline (bin2h() relies on this for the
# formatting of the generated array).
# Parameters:
#   VARIABLE  - The name of the CMake variable holding the string.
#   AT_COLUMN - The column position at which the string will be wrapped.
function(WRAP_STRING)
    set(oneValueArgs VARIABLE AT_COLUMN)
    # FIX: the original passed an undefined ${options} list here; parse with
    # an explicit empty options set instead.
    cmake_parse_arguments(WRAP_STRING "" "${oneValueArgs}" "" ${ARGN})

    string(LENGTH "${${WRAP_STRING_VARIABLE}}" stringLength)
    set(offset 0)
    set(lines "")

    while(stringLength GREATER 0)
        # Take a full-width slice, or whatever remains for the last line.
        if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
            set(length ${WRAP_STRING_AT_COLUMN})
        else()
            set(length ${stringLength})
        endif()

        string(SUBSTRING "${${WRAP_STRING_VARIABLE}}" ${offset} ${length} line)
        string(APPEND lines "\n${line}")

        math(EXPR stringLength "${stringLength} - ${length}")
        math(EXPR offset "${offset} + ${length}")
    endwhile()

    # Write the wrapped string back into the caller's variable.
    set(${WRAP_STRING_VARIABLE} "${lines}" PARENT_SCOPE)
endfunction()
# Function to embed contents of a file as byte array in C/C++ header file(.h). The header file
# will contain a std::array<uint32_t> and the array size is encoded in its type.
# Parameters
#   SOURCE_FILE      - The path of source file whose contents will be embedded in the header file.
#   VARIABLE_NAME    - The name of the variable for the byte array. The name is sanitized into a
#                      valid, upper-case C identifier before use.
#   HEADER_FILE      - The path of header file.
#   APPEND           - If specified, appends to the header file instead of overwriting it.
#   NULL_TERMINATE   - If specified, a null byte (zero) will be appended to the byte array. This is
#                      useful if the source file is a text file and we want to use the file contents
#                      as a string. The size captured for the array excludes this null byte.
#   HEADER_NAMESPACE - The namespace, where the array should be located in.
#   IS_BIG_ENDIAN    - If set to true, will not reverse the byte order of the uint32_t words, to
#                      match a big endian system architecture.
# Usage:
#   bin2h(SOURCE_FILE "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
#
# NOTE(review): HEADER_NAMESPACE and IS_BIG_ENDIAN are read from the caller's
# scope (passed via -D by bin_file_to_header.cmake), not parsed from ARGN.
function(BIN2H)
set(options APPEND NULL_TERMINATE)
set(oneValueArgs SOURCE_FILE VARIABLE_NAME HEADER_FILE)
cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}" "" ${ARGN})
# reads source file contents as hex string (2 hex chars per input byte)
file(READ ${BIN2H_SOURCE_FILE} hexString HEX)
string(LENGTH ${hexString} hexStringLength)
# appends null byte if asked
# (the length was captured above, so the array size intentionally excludes it)
if(BIN2H_NULL_TERMINATE)
set(hexString "${hexString}00")
endif()
# wraps the hex string into multiple lines at column 32(i.e. 16 bytes per line)
wrap_string(VARIABLE hexString AT_COLUMN 32)
# 8 hex chars encode one uint32_t element, hence the division by 8
math(EXPR arraySize "${hexStringLength} / 8")
# Groups of 4 bytes are emitted as one uint32_t literal each; on little endian
# systems the byte order within each word is swapped so the in-memory layout
# of the array matches the file contents.
if(IS_BIG_ENDIAN)
message(STATUS "Interpreting shader in big endian...")
string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\1\\2\\3\\4, " arrayValues ${hexString})
else()
message(STATUS "Interpreting shader in little endian...")
string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\4\\3\\2\\1, " arrayValues ${hexString})
endif()
# removes trailing comma
string(REGEX REPLACE ", $" "" arrayValues ${arrayValues})
# converts the variable name into proper C identifier
string(MAKE_C_IDENTIFIER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
string(TOUPPER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
# declares byte array and the length variables
set(namespaceStart "namespace ${HEADER_NAMESPACE} {")
set(namespaceEnd "} // namespace ${HEADER_NAMESPACE}")
set(arrayIncludes "#pragma once\n#include <array>\n#include <cstdint>")
set(arrayDefinition "const std::array<uint32_t, ${arraySize}> ${BIN2H_VARIABLE_NAME} = { ${arrayValues} };")
set(declarations "${arrayIncludes}\n\n${namespaceStart}\n${arrayDefinition}\n${namespaceEnd}\n\n")
# APPEND lets several embedded files share a single header
if(BIN2H_APPEND)
file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
else()
file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
endif()
endfunction()

View File

@ -0,0 +1,19 @@
# Script mode (cmake -P): converts a binary file (here: a compiled SPIR-V
# shader) into a C++ header containing a uint32_t array, using bin2h.cmake.
# Required -D arguments:
#   INPUT_SHADER_FILE  - path of the binary input file
#   OUTPUT_HEADER_FILE - path of the header file to (over)write
#   HEADER_NAMESPACE   - C++ namespace for the generated array
#   IS_BIG_ENDIAN      - read by bin2h() to pick the uint32_t byte order
cmake_minimum_required(VERSION 3.20)

# FIX: quote the expansions — with the original unquoted form, an undefined
# variable collapsed the if() into invalid syntax ('if( STREQUAL "")')
# instead of producing the intended fatal diagnostic.
if("${INPUT_SHADER_FILE}" STREQUAL "")
    message(FATAL_ERROR "No input file path provided via 'INPUT_SHADER_FILE'.")
endif()

if("${OUTPUT_HEADER_FILE}" STREQUAL "")
    message(FATAL_ERROR "No output file path provided via 'OUTPUT_HEADER_FILE'.")
endif()

if("${HEADER_NAMESPACE}" STREQUAL "")
    message(FATAL_ERROR "No header namespace provided via 'HEADER_NAMESPACE'.")
endif()

# FIX: resolve bin2h.cmake relative to this script instead of the current
# working directory, so the script also works when invoked from elsewhere.
include("${CMAKE_CURRENT_LIST_DIR}/bin2h.cmake")

# Name the embedded array after the input file (sanitized inside bin2h).
get_filename_component(BINARY_FILE_CONTENT ${INPUT_SHADER_FILE} NAME)
bin2h(SOURCE_FILE ${INPUT_SHADER_FILE} HEADER_FILE ${OUTPUT_HEADER_FILE} VARIABLE_NAME ${BINARY_FILE_CONTENT} HEADER_NAMESPACE ${HEADER_NAMESPACE})
file(APPEND ${OUTPUT_HEADER_FILE} "\n")

View File

@ -0,0 +1,139 @@
# Verifies that at least one installed GPU driver supports the Vulkan version
# requested by the installed Vulkan headers, by parsing 'vulkaninfo' output.
# Iterates over all detected GPUs and accepts the first one whose driver
# version is >= the header version (or at least shares the minor version).
# Fails the configure step with a diagnostic otherwise.
# Keyword arguments:
#   INCLUDE_DIR - Vulkan include directory (e.g. ${Vulkan_INCLUDE_DIR})
#                 containing vulkan/vulkan_core.h.
function(check_vulkan_version)
    cmake_parse_arguments(VULKAN_CHECK_VERSION "" "INCLUDE_DIR" "" ${ARGN})
    message(STATUS "Ensuring the currently installed driver supports the Vulkan version requested by the Vulkan Header.")

    # Get the current Vulkan Header version (e.g. 1.2.189).
    # This snippet is based on: https://gitlab.kitware.com/cmake/cmake/-/blob/v3.23.1/Modules/FindVulkan.cmake#L140-156
    if(VULKAN_CHECK_VERSION_INCLUDE_DIR)
        set(VULKAN_CORE_H ${VULKAN_CHECK_VERSION_INCLUDE_DIR}/vulkan/vulkan_core.h)
        if(EXISTS ${VULKAN_CORE_H})
            file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE REGEX "^#define VK_HEADER_VERSION ")
            string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION "${VULKAN_HEADER_VERSION_LINE}")
            file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE2 REGEX "^#define VK_HEADER_VERSION_COMPLETE ")
            # FIX: expansions in the emptiness checks below are quoted — the
            # original unquoted form collapsed to 'if(NOT STREQUAL "")' and
            # broke the configure run whenever the define was absent.
            if(NOT "${VULKAN_HEADER_VERSION_LINE2}" STREQUAL "")
                string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION2 "${VULKAN_HEADER_VERSION_LINE2}")
                list(LENGTH VULKAN_HEADER_VERSION2 _len)
                # Versions >= 1.2.175 have an additional number in front, e.g. '0, 1, 2' instead of '1, 2'
                if(_len EQUAL 3)
                    list(REMOVE_AT VULKAN_HEADER_VERSION2 0)
                endif()
                list(APPEND VULKAN_HEADER_VERSION2 ${VULKAN_HEADER_VERSION})
                list(JOIN VULKAN_HEADER_VERSION2 "." VULKAN_HEADER_VERSION)
            else()
                # Older headers: derive major.minor from the VK_API_VERSION_* defines.
                file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_2 REGEX "^#define VK_API_VERSION_1_2.*")
                if(NOT "${VULKAN_HEADER_API_VERSION_1_2}" STREQUAL "")
                    set(VULKAN_HEADER_VERSION "1.2.${VULKAN_HEADER_VERSION}")
                else()
                    file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_1 REGEX "^#define VK_API_VERSION_1_1.*")
                    if(NOT "${VULKAN_HEADER_API_VERSION_1_1}" STREQUAL "")
                        set(VULKAN_HEADER_VERSION "1.1.${VULKAN_HEADER_VERSION}")
                    else()
                        message(FATAL_ERROR "'${VULKAN_CORE_H}' does not contain a supported Vulkan version. Probably because its < 1.2.0.")
                    endif()
                endif()
            endif()
        else()
            message(FATAL_ERROR "'${VULKAN_CORE_H}' does not exist. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
        endif()
    else()
        message(FATAL_ERROR "Invalid Vulkan include directory given. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
    endif()
    message(STATUS "Found Vulkan Header version: ${VULKAN_HEADER_VERSION}")

    # Get the Vulkan version supported by the driver(s) via vulkaninfo.
    find_program(VULKAN_INFO_PATH NAMES vulkaninfo)
    if(NOT VULKAN_INFO_PATH)
        message(FATAL_ERROR "vulkaninfo not found. The Vulkan SDK might not be installed properly. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
    endif()
    # FIX: run the executable that find_program() actually located instead of
    # relying on a second PATH lookup of the literal name "vulkaninfo".
    execute_process(COMMAND "${VULKAN_INFO_PATH}"
                    OUTPUT_VARIABLE VULKAN_INFO_OUTPUT
                    RESULT_VARIABLE VULKAN_INFO_RETURN)
    if(NOT VULKAN_INFO_RETURN EQUAL 0)
        message(FATAL_ERROR "Running vulkaninfo failed with return code ${VULKAN_INFO_RETURN}. Make sure you have 'vulkan-tools' installed. Result:\n${VULKAN_INFO_OUTPUT}?")
    else()
        message(STATUS "Running vulkaninfo was successful. Parsing the output...")
    endif()

    # Sanity-check the output before parsing it.
    string(FIND "${VULKAN_INFO_OUTPUT}" "Vulkan Instance Version" VULKAN_INFO_SUCCESSFUL)
    if(VULKAN_INFO_SUCCESSFUL LESS 0)
        message(FATAL_ERROR "Running vulkaninfo failed. Make sure you have 'vulkan-tools' installed and DISPLAY is configured. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON). Result:\n${VULKAN_INFO_OUTPUT}?")
    endif()

    string(REGEX MATCHALL "(GPU[0-9]+)" GPU_IDS "${VULKAN_INFO_OUTPUT}")
    if(NOT GPU_IDS)
        message(FATAL_ERROR "No GPU supporting Vulkan found in vulkaninfo. Does your GPU (driver) support Vulkan?")
    endif()
    # FIX: quoted input — an unquoted ${VULKAN_INFO_OUTPUT} would be split on
    # any semicolons contained in the tool's output.
    string(REGEX MATCHALL "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*" GPU_API_VERSIONS "${VULKAN_INFO_OUTPUT}")
    if(NOT GPU_API_VERSIONS)
        message(FATAL_ERROR "No valid Vulkan API version found in vulkaninfo. Does your GPU (driver) support Vulkan?")
    endif()

    # Every detected GPU must pair up with exactly one apiVersion entry.
    list(LENGTH GPU_IDS GPU_IDS_LENGTH)
    list(LENGTH GPU_API_VERSIONS GPU_API_VERSIONS_LENGTH)
    if(NOT GPU_IDS_LENGTH EQUAL GPU_API_VERSIONS_LENGTH)
        message(FATAL_ERROR "Found ${GPU_IDS_LENGTH} GPUs, but ${GPU_API_VERSIONS_LENGTH} API versions in vulkaninfo. We expected to find an equal amount of them.")
    endif()

    # Compare the header version against each GPU's driver version; stop at
    # the first GPU that is usable.
    set(VALID_GPU "")
    set(VALID_VULKAN_VERSION "")
    math(EXPR ITER_LEN "${GPU_IDS_LENGTH} - 1")
    foreach(INDEX RANGE ${ITER_LEN})
        list(GET GPU_IDS ${INDEX} GPU)
        list(GET GPU_API_VERSIONS ${INDEX} API_VERSION)
        # Extract the dotted API version from the matched line.
        if("${API_VERSION}" MATCHES "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*")
            set(VULKAN_DRIVER_VERSION ${CMAKE_MATCH_1})
        else()
            message(FATAL_ERROR "API version match failed. This should not have happened...")
        endif()
        message(STATUS "${GPU} supports Vulkan API version '${VULKAN_DRIVER_VERSION}'.")

        # Compare driver and header version.
        if(VULKAN_DRIVER_VERSION VERSION_LESS VULKAN_HEADER_VERSION)
            # Version mismatch. Accept the GPU anyway if the minor version matches.
            if("${VULKAN_DRIVER_VERSION}" MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
                set(VULKAN_DRIVER_MINOR_VERSION ${CMAKE_MATCH_1})
            else()
                message(FATAL_ERROR "Invalid Vulkan driver version '${VULKAN_DRIVER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
            endif()
            if("${VULKAN_HEADER_VERSION}" MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
                set(VULKAN_HEADER_MINOR_VERSION ${CMAKE_MATCH_1})
            else()
                message(FATAL_ERROR "Invalid Vulkan Header version '${VULKAN_HEADER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
            endif()
            if(VULKAN_DRIVER_MINOR_VERSION EQUAL VULKAN_HEADER_MINOR_VERSION)
                message(WARNING "Your GPU driver does not support Vulkan > ${VULKAN_DRIVER_VERSION}, but you try to use Vulkan Header ${VULKAN_HEADER_VERSION}. At least your driver supports the same minor version (${VULKAN_DRIVER_MINOR_VERSION}), so this should be fine but keep it in mind in case you encounter any strange behavior.")
                set(VALID_GPU ${GPU})
                set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
                break()
            else()
                message(STATUS "${GPU} does not support Vulkan > ${VULKAN_DRIVER_VERSION}.")
            endif()
        else()
            set(VALID_GPU ${GPU})
            set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
            break()
        endif()
    endforeach()

    if("${VALID_GPU}" STREQUAL "")
        message(FATAL_ERROR "None of your GPUs supports Vulkan Header ${VULKAN_HEADER_VERSION}. Please try updating your driver, or downgrade your Vulkan headers. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
    else()
        message("Valid GPU (${VALID_GPU}) for Vulkan header version ${VULKAN_HEADER_VERSION} found. ${VALID_GPU} supports up to Vulkan ${VALID_VULKAN_VERSION}.")
    endif()
endfunction()

View File

@ -0,0 +1,35 @@
# Code coverage configuration (gcov/lcov).
# Including this module switches the build into a dedicated 'COVERAGE'
# configuration with instrumentation flags.
set(CMAKE_BUILD_TYPE COVERAGE CACHE INTERNAL "Coverage build enabled")
message(STATUS "Enabling gcov support")
# NOTE(review): '--coverage' is only added for non-Clang compilers here; Clang
# builds rely solely on the explicit -fprofile-arcs/-ftest-coverage flags
# below — confirm this is intentional.
if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(COVERAGE_FLAG "--coverage")
endif()
# Compiler flags for the COVERAGE configuration: debug info, no optimization,
# and gcov instrumentation.
set(CMAKE_CXX_FLAGS_COVERAGE
"-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
CACHE STRING "Flags used by the C++ compiler during coverage builds."
FORCE)
set(CMAKE_C_FLAGS_COVERAGE
"-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
CACHE STRING "Flags used by the C compiler during coverage builds."
FORCE)
# Linker flags are intentionally left empty for this configuration.
set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
""
CACHE STRING "Flags used for linking binaries during coverage builds."
FORCE)
set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
""
CACHE STRING "Flags used by the shared libraries linker during coverage builds."
FORCE)
# Output locations for the lcov data and the generated HTML report.
set(CODECOV_DIR ${CMAKE_CURRENT_BINARY_DIR}/codecov/)
set(CODECOV_DIR_LCOV ${CODECOV_DIR}lcov/)
set(CODECOV_FILENAME_LCOV_INFO lcov.info)
set(CODECOV_FILENAME_LCOV_INFO_FULL lcov_full.info)
set(CODECOV_DIR_HTML ${CODECOV_DIR}html/)
# Hide the coverage flag variables from the default cmake-gui/ccmake view.
mark_as_advanced(CMAKE_CXX_FLAGS_COVERAGE
CMAKE_C_FLAGS_COVERAGE
CMAKE_EXE_LINKER_FLAGS_COVERAGE
CMAKE_SHARED_LINKER_FLAGS_COVERAGE)

View File

@ -0,0 +1,15 @@
# Fail fast with a migration hint when a removed or renamed Kompute CMake
# option is still set by the user or by a superbuild. Only the first option
# that is set triggers its error.
# Replaced by the per-dependency KOMPUTE_OPT_USE_BUILT_IN_* switches.
if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
message(FATAL_ERROR "'KOMPUTE_OPT_REPO_SUBMODULE_BUILD' got replaced by 'KOMPUTE_OPT_USE_BUILT_IN_SPDLOG', 'KOMPUTE_OPT_USE_BUILT_IN_FMT', 'KOMPUTE_OPT_USE_BUILT_IN_GOOGLE_TEST', 'KOMPUTE_OPT_USE_BUILT_IN_PYBIND11' and 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER'. Please use them instead.")
endif()
# Superseded by CMake's standard BUILD_SHARED_LIBS switch.
if(KOMPUTE_OPT_BUILD_AS_SHARED_LIB)
message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_AS_SHARED_LIB' is deprecated and should not be used. Instead use the default 'BUILD_SHARED_LIBS' CMake switch.")
endif()
# The single header is now always generated.
if(KOMPUTE_OPT_BUILD_SINGLE_HEADER)
message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_SINGLE_HEADER' is deprecated and should not be used. The single header will now always be build and can be included via '#include<kompute/kompute.h>'.")
endif()
# Logging is now configured through KOMPUTE_OPT_LOG_LEVEL.
if(KOMPUTE_OPT_ENABLE_SPDLOG)
message(FATAL_ERROR "'KOMPUTE_OPT_ENABLE_SPDLOG' is deprecated and should not be used. It got replaced by 'KOMPUTE_OPT_LOG_LEVEL'. This option can be set to a variety of log levels (e.g. 'Off', 'Trace', 'Debug', 'Default', ...).")
endif()

View File

@ -0,0 +1,8 @@
# Package config template for find_package(kompute).
@PACKAGE_INIT@

include(CMakeFindDependencyMacro)

# FIX: the bundled find module is 'FindVulkan.cmake' — package names are
# case-sensitive on case-sensitive filesystems, so 'VULKAN' would not resolve
# there.
find_dependency(Vulkan REQUIRED)

include(${CMAKE_CURRENT_LIST_DIR}/komputeTargets.cmake)

check_required_components(kompute)

View File

@ -0,0 +1,43 @@
# Compiles a Vulkan compute shader to SPIR-V with glslangValidator and then
# converts the result into a C++ header via bin_file_to_header.cmake.
# Keyword arguments:
#   INFILE        - shader source path, relative to CMAKE_CURRENT_SOURCE_DIR
#   OUTFILE       - header file name, relative to CMAKE_CURRENT_BINARY_DIR
#   NAMESPACE     - C++ namespace for the generated array
#   RELATIVE_PATH - directory containing bin_file_to_header.cmake
#                   (defaults to ${PROJECT_SOURCE_DIR}/cmake)
function(vulkan_compile_shader)
    find_program(GLS_LANG_VALIDATOR_PATH NAMES glslangValidator)
    # FIX: idiomatic not-found check instead of comparing against the literal
    # "<var>-NOTFOUND" string; also dropped the unreachable return() that
    # followed the FATAL_ERROR.
    if(NOT GLS_LANG_VALIDATOR_PATH)
        message(FATAL_ERROR "glslangValidator not found.")
    endif()

    cmake_parse_arguments(SHADER_COMPILE "" "INFILE;OUTFILE;NAMESPACE;RELATIVE_PATH" "" ${ARGN})
    set(SHADER_COMPILE_INFILE_FULL "${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_COMPILE_INFILE}")
    set(SHADER_COMPILE_SPV_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_INFILE}.spv")
    set(SHADER_COMPILE_HEADER_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_OUTFILE}")

    if(NOT SHADER_COMPILE_RELATIVE_PATH)
        set(SHADER_COMPILE_RELATIVE_PATH "${PROJECT_SOURCE_DIR}/cmake")
    endif()

    # .comp -> .spv
    # FIX: VERBATIM added so argument escaping is platform-independent.
    add_custom_command(OUTPUT "${SHADER_COMPILE_SPV_FILE_FULL}"
                       COMMAND "${GLS_LANG_VALIDATOR_PATH}"
                       ARGS "-V"
                            "${SHADER_COMPILE_INFILE_FULL}"
                            "-o"
                            "${SHADER_COMPILE_SPV_FILE_FULL}"
                       COMMENT "Compile vulkan compute shader from file '${SHADER_COMPILE_INFILE_FULL}' to '${SHADER_COMPILE_SPV_FILE_FULL}'."
                       MAIN_DEPENDENCY "${SHADER_COMPILE_INFILE_FULL}"
                       VERBATIM)

    # Host endianness is forwarded so bin2h emits uint32_t words in native order.
    include(TestBigEndian)
    test_big_endian(IS_BIG_ENDIAN)

    # .spv -> .hpp
    add_custom_command(OUTPUT "${SHADER_COMPILE_HEADER_FILE_FULL}"
                       COMMAND ${CMAKE_COMMAND}
                       ARGS "-DINPUT_SHADER_FILE=${SHADER_COMPILE_SPV_FILE_FULL}"
                            "-DOUTPUT_HEADER_FILE=${SHADER_COMPILE_HEADER_FILE_FULL}"
                            "-DHEADER_NAMESPACE=${SHADER_COMPILE_NAMESPACE}"
                            "-DIS_BIG_ENDIAN=${IS_BIG_ENDIAN}"
                            "-P"
                            "${SHADER_COMPILE_RELATIVE_PATH}/bin_file_to_header.cmake"
                       WORKING_DIRECTORY "${SHADER_COMPILE_RELATIVE_PATH}"
                       COMMENT "Converting compiled shader '${SHADER_COMPILE_SPV_FILE_FULL}' to header file '${SHADER_COMPILE_HEADER_FILE_FULL}'."
                       MAIN_DEPENDENCY "${SHADER_COMPILE_SPV_FILE_FULL}"
                       VERBATIM)
endfunction()

View File

@ -0,0 +1,16 @@
# Find module for the Sphinx documentation generator.
# Sets SPHINX_EXECUTABLE (path to sphinx-build) and, via
# find_package_handle_standard_args, Sphinx_FOUND.
# Look for an executable called sphinx-build
find_program(SPHINX_EXECUTABLE
NAMES sphinx-build
DOC "Path to sphinx-build executable")
# NOTE(review): failing hard here defeats 'find_package(Sphinx QUIET)' and
# optional lookups; normally find_package_handle_standard_args below would be
# left to enforce REQUIRED — confirm callers rely on the unconditional failure.
if(SPHINX_EXECUTABLE STREQUAL "SPHINX_EXECUTABLE-NOTFOUND")
message(FATAL_ERROR "sphinx-build not found.")
endif()
include(FindPackageHandleStandardArgs)
# Handle standard arguments to find_package like REQUIRED and QUIET
find_package_handle_standard_args(
Sphinx
"Failed to find sphinx-build executable"
SPHINX_EXECUTABLE)

819
kompute/external/bin/xxd.c vendored Normal file
View File

@ -0,0 +1,819 @@
/*
As indicated at https://lists.debian.org/debian-legal/2015/01/msg00037.html,
the author has permitted redistribution of xxd under the MIT license, as follows:
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* xxd: my hexdump facility. jw
*
* 2.10.90 changed to word output
* 3.03.93 new indent style, dumb bug inserted and fixed.
* -c option, mls
* 26.04.94 better option parser, -ps, -l, -s added.
* 1.07.94 -r badly needs - as input file. Per default autoskip over
* consecutive lines of zeroes, as unix od does.
* -a shows them too.
* -i dump as c-style #include "file.h"
* 1.11.95 if "xxd -i" knows the filename, an 'unsigned char filename_bits[]'
* array is written in correct c-syntax.
* -s improved, now defaults to absolute seek, relative requires a '+'.
* -r improved, now -r -s -0x... is supported.
* change/suppress leading '\0' bytes.
* -l n improved: stops exactly after n bytes.
* -r improved, better handling of partial lines with trailing garbage.
* -r improved, now -r -p works again!
* -r improved, less flushing, much faster now! (that was silly)
* 3.04.96 Per repeated request of a single person: autoskip defaults to off.
* 15.05.96 -v added. They want to know the version.
* -a fixed, to show last line inf file ends in all zeros.
* -u added: Print upper case hex-letters, as preferred by unix bc.
* -h added to usage message. Usage message extended.
* Now using outfile if specified even in normal mode, aehem.
* No longer mixing of ints and longs. May help doze people.
* Added binify ioctl for same reason. (Enough Doze stress for 1996!)
* 16.05.96 -p improved, removed occasional superfluous linefeed.
* 20.05.96 -l 0 fixed. tried to read anyway.
* 21.05.96 -i fixed. now honours -u, and prepends __ to numeric filenames.
* compile -DWIN32 for NT or W95. George V. Reilly, * -v improved :-)
* support --gnuish-longhorn-options
* 25.05.96 MAC support added: CodeWarrior already uses ``outline'' in Types.h
* which is included by MacHeaders (Axel Kielhorn). Renamed to
* xxdline().
* 7.06.96 -i printed 'int' instead of 'char'. *blush*
* added Bram's OS2 ifdefs...
* 18.07.96 gcc -Wall @ SunOS4 is now slient.
* Added osver for MSDOS/DJGPP/WIN32.
* 29.08.96 Added size_t to strncmp() for Amiga.
* 24.03.97 Windows NT support (Phil Hanna). Clean exit for Amiga WB (Bram)
* 02.04.97 Added -E option, to have EBCDIC translation instead of ASCII
* (azc10@yahoo.com)
* 22.05.97 added -g (group octets) option (jcook@namerica.kla.com).
* 23.09.98 nasty -p -r misfeature fixed: slightly wrong output, when -c was
* missing or wrong.
* 26.09.98 Fixed: 'xxd -i infile outfile' did not truncate outfile.
* 27.10.98 Fixed: -g option parser required blank.
* option -b added: 01000101 binary output in normal format.
* 16.05.00 Added VAXC changes by Stephen P. Wall
* 16.05.00 Improved MMS file and merge for VMS by Zoltan Arpadffy
*
* (c) 1990-1998 by Juergen Weigert (jnweiger@informatik.uni-erlangen.de)
*
* Small changes made afterwards by Bram Moolenaar et al.
*
* Distribute freely and credit me,
* make money and share with me,
* lose money and don't ask me.
*
*
*/
/* Visual Studio 2005 has 'deprecated' many of the standard CRT functions */
#if _MSC_VER >= 1400
# define _CRT_SECURE_NO_DEPRECATE
# define _CRT_NONSTDC_NO_DEPRECATE
#endif
#include <stdio.h>
#ifdef VAXC
# include <file.h>
#else
# include <fcntl.h>
#endif
#ifdef __TSC__
# define MSDOS
#endif
#if !defined(OS2) && defined(__EMX__)
# define OS2
#endif
#if defined(MSDOS) || defined(WIN32) || defined(OS2) || defined(__BORLANDC__) || defined(CYGWIN)
# include <io.h> /* for setmode() */
#else
# ifdef UNIX
# include <unistd.h>
# endif
#endif
#include <stdlib.h>
#include <string.h> /* for strncmp() */
#include <ctype.h> /* for isalnum() */
#if __MWERKS__ && !defined(BEBOX)
# include <unix.h> /* for fdopen() on MAC */
#endif
#if defined(__BORLANDC__) && __BORLANDC__ <= 0x0410 && !defined(fileno)
/* Missing define and prototype grabbed from the BC 4.0 <stdio.h> */
# define fileno(f) ((f)->fd)
FILE _FAR *_Cdecl _FARFUNC fdopen(int __handle, char _FAR *__type);
#endif
/* This corrects the problem of missing prototypes for certain functions
* in some GNU installations (e.g. SunOS 4.1.x).
* Darren Hiebert <darren@hmi.com> (sparc-sun-sunos4.1.3_U1/2.7.2.2)
*/
#if defined(__GNUC__) && defined(__STDC__)
# ifndef __USE_FIXED_PROTOTYPES__
# define __USE_FIXED_PROTOTYPES__
# endif
#endif
#ifndef __USE_FIXED_PROTOTYPES__
/*
* This is historic and works only if the compiler really has no prototypes:
*
* Include prototypes for Sun OS 4.x, when using an ANSI compiler.
* FILE is defined on OS 4.x, not on 5.x (Solaris).
* if __SVR4 is defined (some Solaris versions), don't include this.
*/
#if defined(sun) && defined(FILE) && !defined(__SVR4) && defined(__STDC__)
# define __P(a) a
/* excerpt from my sun_stdlib.h */
extern int fprintf __P((FILE *, char *, ...));
extern int fputs __P((char *, FILE *));
extern int _flsbuf __P((unsigned char, FILE *));
extern int _filbuf __P((FILE *));
extern int fflush __P((FILE *));
extern int fclose __P((FILE *));
extern int fseek __P((FILE *, long, int));
extern int rewind __P((FILE *));
extern void perror __P((char *));
# endif
#endif
extern long int strtol();
extern long int ftell();

/* version banner, printed by -v and in the usage text */
char version[] = "xxd V1.10 27oct98 by Juergen Weigert";
#ifdef WIN32
char osver[] = " (Win32)";
#else
# ifdef DJGPP
char osver[] = " (dos 32 bit)";
# else
# ifdef MSDOS
char osver[] = " (dos 16 bit)";
# else
char osver[] = "";
# endif
# endif
#endif
#if !defined(CYGWIN) && (defined(CYGWIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__))
# define CYGWIN
#endif
/* Per-platform binary/text stream handling: BIN_READ/BIN_WRITE give the
 * fopen() mode string, BIN_CREAT the open() flags, and BIN_ASSIGN switches
 * an already-open stream into binary mode where the platform requires it. */
#if defined(MSDOS) || defined(WIN32) || defined(OS2)
# define BIN_READ(yes) ((yes) ? "rb" : "rt")
# define BIN_WRITE(yes) ((yes) ? "wb" : "wt")
# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
# define BIN_ASSIGN(fp, yes) setmode(fileno(fp), (yes) ? O_BINARY : O_TEXT)
# define PATH_SEP '\\'
#elif defined(CYGWIN)
# define BIN_READ(yes) ((yes) ? "rb" : "rt")
# define BIN_WRITE(yes) ((yes) ? "wb" : "w")
# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
# define BIN_ASSIGN(fp, yes) ((yes) ? (void) setmode(fileno(fp), O_BINARY) : (void) (fp))
# define PATH_SEP '/'
#else
# ifdef VMS
# define BIN_READ(dummy) "r"
# define BIN_WRITE(dummy) "w"
# define BIN_CREAT(dummy) O_CREAT
# define BIN_ASSIGN(fp, dummy) fp
# define PATH_SEP ']'
# define FILE_SEP '.'
# else
# define BIN_READ(dummy) "r"
# define BIN_WRITE(dummy) "w"
# define BIN_CREAT(dummy) O_CREAT
# define BIN_ASSIGN(fp, dummy) fp
# define PATH_SEP '/'
# endif
#endif
/* open has only two arguments on the Mac */
#if __MWERKS__
# define OPEN(name, mode, umask) open(name, mode)
#else
# define OPEN(name, mode, umask) open(name, mode, umask)
#endif
#ifdef AMIGA
# define STRNCMP(s1, s2, l) strncmp(s1, s2, (size_t)l)
#else
# define STRNCMP(s1, s2, l) strncmp(s1, s2, l)
#endif
/* __P(): prototype wrapper for pre-ANSI compilers without prototypes */
#ifndef __P
# if defined(__STDC__) || defined(MSDOS) || defined(WIN32) || defined(OS2) \
|| defined(__BORLANDC__)
# define __P(a) a
# else
# define __P(a) ()
# endif
#endif
/* Let's collect some prototypes */
/* CodeWarrior is really picky about missing prototypes */
static void exit_with_usage __P((char *));
static int huntype __P((FILE *, FILE *, FILE *, char *, int, int, long));
static void xxdline __P((FILE *, char *, int));
#define TRY_SEEK /* attempt to use lseek, or skip forward by reading */
#define COLS 256 /* change here, if you ever need more columns */
#define LLEN (11 + (9*COLS-1)/1 + COLS + 2) /* max chars in one output line */
/* lower- and upper-case hex digit tables; hexx selects which is in use (-u) */
char hexxa[] = "0123456789abcdef0123456789ABCDEF", *hexx = hexxa;
/* the different hextypes known by this program: */
#define HEX_NORMAL 0
#define HEX_POSTSCRIPT 1
#define HEX_CINCLUDE 2
#define HEX_BITS 3 /* not a hex dump, but bits: 01111001 */
/*
 * Print the usage summary for program name "pname" to stderr and
 * terminate with exit status 1.  Never returns.
 */
static void
exit_with_usage(pname)
char *pname;
{
    fprintf(stderr, "Usage:\n %s [options] [infile [outfile]]\n", pname);
    fprintf(stderr, " or\n %s -r [-s [-]offset] [-c cols] [-ps] [infile [outfile]]\n", pname);
    fprintf(stderr, "Options:\n");
    fprintf(stderr, " -a toggle autoskip: A single '*' replaces nul-lines. Default off.\n");
    fprintf(stderr, " -b binary digit dump (incompatible with -p,-i,-r). Default hex.\n");
    fprintf(stderr, " -c cols format <cols> octets per line. Default 16 (-i: 12, -ps: 30).\n");
    fprintf(stderr, " -E show characters in EBCDIC. Default ASCII.\n");
    fprintf(stderr, " -g number of octets per group in normal output. Default 2.\n");
    fprintf(stderr, " -h print this summary.\n");
    fprintf(stderr, " -i output in C include file style.\n");
    fprintf(stderr, " -l len stop after <len> octets.\n");
    fprintf(stderr, " -ps output in postscript plain hexdump style.\n");
    fprintf(stderr, " -r reverse operation: convert (or patch) hexdump into binary.\n");
    fprintf(stderr, " -r -s off revert with <off> added to file positions found in hexdump.\n");
    /* the -s help line differs depending on whether seeking is compiled in */
    fprintf(stderr, " -s %sseek start at <seek> bytes abs. %sinfile offset.\n",
#ifdef TRY_SEEK
    "[+][-]", "(or +: rel.) ");
#else
    "", "");
#endif
    fprintf(stderr, " -u use upper case hex letters.\n");
    fprintf(stderr, " -v show version: \"%s%s\".\n", version, osver);
    exit(1);
}
/*
* Max. cols binary characters are decoded from the input stream per line.
* Two adjacent garbage characters after evaluated data delimit valid data.
* Everything up to the next newline is discarded.
*
* The name is historic and came from 'undo type opt h'.
*/
/*
 * Revert a hexdump back to binary (the -r mode).
 *
 *   fpi      - input stream holding the hexdump (rewound on entry)
 *   fpo      - output stream for the reconstructed binary (closed on return)
 *   fperr    - stream for error messages
 *   pname    - program name, used in error messages
 *   cols     - octets per line expected in the dump
 *   hextype  - 0 for HEX_NORMAL (with leading offsets), else postscript style
 *   base_off - offset added to the file positions found in the dump (-r -s)
 *
 * Returns 0 on success, 5 when a backwards seek would be required.
 */
static int
huntype(fpi, fpo, fperr, pname, cols, hextype, base_off)
FILE *fpi, *fpo, *fperr;
char *pname;
int cols, hextype;
long base_off;
{
    /* n1/n2/n3: last three nibble values (-1 = non-hex); p counts octets
     * on the current line; ign_garb skips leading garbage */
    int c, ign_garb = 1, n1 = -1, n2 = 0, n3, p = cols;
    long have_off = 0, want_off = 0;

    rewind(fpi);
    while ((c = getc(fpi)) != EOF)
    {
        if (c == '\r') /* Doze style input file? */
            continue;
#if 0 /* this doesn't work when there is normal text after the hex codes in
the last line that looks like hex */
        if (c == ' ' || c == '\n' || c == '\t') /* allow multiple spaces */
            continue;
#endif
        /* shift the nibble history and classify the new character */
        n3 = n2;
        n2 = n1;
        if (c >= '0' && c <= '9')
            n1 = c - '0';
        else if (c >= 'a' && c <= 'f')
            n1 = c - 'a' + 10;
        else if (c >= 'A' && c <= 'F')
            n1 = c - 'A' + 10;
        else
        {
            n1 = -1;
            if (ign_garb)
                continue;
        }
        ign_garb = 0;
        if (p >= cols)
        {
            /* at start of a line: in HEX_NORMAL mode accumulate the
             * leading offset field nibble by nibble */
            if (!hextype)
            {
                if (n1 < 0)
                {
                    p = 0;
                    continue;
                }
                want_off = (want_off << 4) | n1;
                continue;
            }
            else
                p = 0;
        }
        /* reposition the output stream if the dump's offset jumped */
        if (base_off + want_off != have_off)
        {
            fflush(fpo);
#ifdef TRY_SEEK
            c = fseek(fpo, base_off + want_off - have_off, 1);
            if (c >= 0)
                have_off = base_off + want_off;
#endif
            if (base_off + want_off < have_off)
            {
                fprintf(fperr, "%s: sorry, cannot seek backwards.\n", pname);
                return 5;
            }
            /* seek failed or unavailable: pad forward with NUL bytes */
            for (; have_off < base_off + want_off; have_off++)
                putc(0, fpo);
        }
        if (n2 >= 0 && n1 >= 0)
        {
            /* two valid nibbles -> emit one reconstructed byte */
            putc((n2 << 4) | n1, fpo);
            have_off++;
            want_off++;
            n1 = -1;
            if ((++p >= cols) && !hextype)
            {
                /* skip rest of line as garbage */
                want_off = 0;
                while ((c = getc(fpi)) != '\n' && c != EOF)
                    ;
                ign_garb = 1;
            }
        }
        else if (n1 < 0 && n2 < 0 && n3 < 0)
        {
            /* already stumbled into garbage, skip line, wait and see */
            if (!hextype)
                want_off = 0;
            while ((c = getc(fpi)) != '\n' && c != EOF)
                ;
            ign_garb = 1;
        }
    }
    fflush(fpo);
#ifdef TRY_SEEK
    fseek(fpo, 0L, 2);
#endif
    fclose(fpo);
    fclose(fpi);
    return 0;
}
/*
* Print line l. If nz is false, xxdline regards the line a line of
* zeroes. If there are three or more consecutive lines of zeroes,
* they are replaced by a single '*' character.
*
* If the output ends with more than two lines of zeroes, you
* should call xxdline again with l being the last line and nz
* negative. This ensures that the last line is shown even when
* it is all zeroes.
*
* If nz is always positive, lines are never suppressed.
*/
/* (See the contract described in the comment block above.)
 * Keeps two pieces of static state across calls: z holds the first
 * suppressed all-zero line so it can still be printed when exactly two
 * zero lines occur, and zero_seen counts consecutive zero lines. */
static void
xxdline(fp, l, nz)
FILE *fp;
char *l;
int nz;
{
    static char z[LLEN+1];
    static int zero_seen = 0;

    /* remember the second consecutive zero line verbatim */
    if (!nz && zero_seen == 1)
        strcpy(z, l);
    if (nz || !zero_seen++)
    {
        if (nz)
        {
            if (nz < 0) /* flush mode: don't count this call */
                zero_seen--;
            if (zero_seen == 2) /* exactly two zero lines: print the saved one */
                fputs(z, fp);
            if (zero_seen > 2) /* three or more: collapse into '*' */
                fputs("*\n", fp);
        }
        if (nz >= 0 || zero_seen > 0)
            fputs(l, fp);
        if (nz)
            zero_seen = 0;
    }
}
/* This is an EBCDIC to ASCII conversion table */
/* from a proposed BTL standard April 16, 1979 */
/* Translation table indexed by (ebcdic_code - 64); codes below 64 are
 * rendered as '.' by the caller (see the -E handling in main()). */
static unsigned char etoa64[] =
{
    0040,0240,0241,0242,0243,0244,0245,0246,
    0247,0250,0325,0056,0074,0050,0053,0174,
    0046,0251,0252,0253,0254,0255,0256,0257,
    0260,0261,0041,0044,0052,0051,0073,0176,
    0055,0057,0262,0263,0264,0265,0266,0267,
    0270,0271,0313,0054,0045,0137,0076,0077,
    0272,0273,0274,0275,0276,0277,0300,0301,
    0302,0140,0072,0043,0100,0047,0075,0042,
    0303,0141,0142,0143,0144,0145,0146,0147,
    0150,0151,0304,0305,0306,0307,0310,0311,
    0312,0152,0153,0154,0155,0156,0157,0160,
    0161,0162,0136,0314,0315,0316,0317,0320,
    0321,0345,0163,0164,0165,0166,0167,0170,
    0171,0172,0322,0323,0324,0133,0326,0327,
    0330,0331,0332,0333,0334,0335,0336,0337,
    0340,0341,0342,0343,0344,0135,0346,0347,
    0173,0101,0102,0103,0104,0105,0106,0107,
    0110,0111,0350,0351,0352,0353,0354,0355,
    0175,0112,0113,0114,0115,0116,0117,0120,
    0121,0122,0356,0357,0360,0361,0362,0363,
    0134,0237,0123,0124,0125,0126,0127,0130,
    0131,0132,0364,0365,0366,0367,0370,0371,
    0060,0061,0062,0063,0064,0065,0066,0067,
    0070,0071,0372,0373,0374,0375,0376,0377
};
/*
 * Return a pointer into "path" at the basename: everything after the
 * last '/' separator, or the whole string when no '/' is present.
 *
 * A NULL path is returned unchanged.  The guard matters: main() calls
 * extract_filename(argv[1]) for -i mode before checking whether input
 * is stdin, and argv[1] is NULL when no file argument was given, which
 * previously handed NULL to strrchr() (undefined behavior).
 */
const char* extract_filename(const char* path) {
    const char* slash;

    if (path == NULL) /* argv[1] may be NULL when reading stdin */
        return NULL;
    slash = strrchr(path, '/');
    if (slash)
        return slash + 1;
    return path;
}
/*
 * Command-line entry point: parse the options, open the input and
 * output streams, then dispatch to the requested mode (revert, C
 * include, postscript, or the normal hex/bits dump).
 */
int
main(argc, argv)
int argc;
char *argv[];
{
    FILE *fp, *fpo;
    int c, e, p = 0, relseek = 1, negseek = 0, revert = 0;
    int cols = 0, nonzero = 0, autoskip = 0, hextype = HEX_NORMAL;
    int ebcdic = 0;
    int octspergrp = -1; /* number of octets grouped in output */
    int grplen; /* total chars per octet group */
    long length = -1, n = 0, seekoff = 0;
    char l[LLEN+1]; /* assembled output line */
    char *pname, *pp;

#ifdef AMIGA
    /* This program doesn't work when started from the Workbench */
    if (argc == 0)
        exit(1);
#endif

    /* strip the directory part (and, where FILE_SEP is defined, the
     * file type suffix) off argv[0] for use in messages */
    pname = argv[0];
    for (pp = pname; *pp; )
        if (*pp++ == PATH_SEP)
            pname = pp;
#ifdef FILE_SEP
    for (pp = pname; *pp; pp++)
        if (*pp == FILE_SEP)
        {
            *pp = '\0';
            break;
        }
#endif

    /* option parsing; "--opt" is accepted as a synonym for "-opt" */
    while (argc >= 2)
    {
        pp = argv[1] + (!STRNCMP(argv[1], "--", 2) && argv[1][2]);
        if (!STRNCMP(pp, "-a", 2)) autoskip = 1 - autoskip;
        else if (!STRNCMP(pp, "-b", 2)) hextype = HEX_BITS;
        else if (!STRNCMP(pp, "-u", 2)) hexx = hexxa + 16;
        else if (!STRNCMP(pp, "-p", 2)) hextype = HEX_POSTSCRIPT;
        else if (!STRNCMP(pp, "-i", 2)) hextype = HEX_CINCLUDE;
        else if (!STRNCMP(pp, "-r", 2)) revert++;
        else if (!STRNCMP(pp, "-E", 2)) ebcdic++;
        else if (!STRNCMP(pp, "-v", 2))
        {
            fprintf(stderr, "%s%s\n", version, osver);
            exit(0);
        }
        else if (!STRNCMP(pp, "-c", 2))
        {
            /* value may be glued on ("-c8") or the next argument */
            if (pp[2] && STRNCMP("ols", pp + 2, 3))
                cols = (int)strtol(pp + 2, NULL, 0);
            else
            {
                if (!argv[2])
                    exit_with_usage(pname);
                cols = (int)strtol(argv[2], NULL, 0);
                argv++;
                argc--;
            }
        }
        else if (!STRNCMP(pp, "-g", 2))
        {
            if (pp[2] && STRNCMP("roupsize", pp + 2, 8))
                octspergrp = (int)strtol(pp + 2, NULL, 0);
            else
            {
                if (!argv[2])
                    exit_with_usage(pname);
                octspergrp = (int)strtol(argv[2], NULL, 0);
                argv++;
                argc--;
            }
        }
        else if (!STRNCMP(pp, "-s", 2))
        {
            relseek = 0;
            negseek = 0;
            if (pp[2] && STRNCMP("kip", pp+2, 3) && STRNCMP("eek", pp+2, 3))
            {
#ifdef TRY_SEEK
                /* leading '+' means relative seek, '-' means from the end */
                if (pp[2] == '+')
                    relseek++;
                if (pp[2+relseek] == '-')
                    negseek++;
#endif
                seekoff = strtol(pp + 2+relseek+negseek, (char **)NULL, 0);
            }
            else
            {
                if (!argv[2])
                    exit_with_usage(pname);
#ifdef TRY_SEEK
                if (argv[2][0] == '+')
                    relseek++;
                if (argv[2][relseek] == '-')
                    negseek++;
#endif
                seekoff = strtol(argv[2] + relseek+negseek, (char **)NULL, 0);
                argv++;
                argc--;
            }
        }
        else if (!STRNCMP(pp, "-l", 2))
        {
            if (pp[2] && STRNCMP("en", pp + 2, 2))
                length = strtol(pp + 2, (char **)NULL, 0);
            else
            {
                if (!argv[2])
                    exit_with_usage(pname);
                length = strtol(argv[2], (char **)NULL, 0);
                argv++;
                argc--;
            }
        }
        else if (!strcmp(pp, "--")) /* end of options */
        {
            argv++;
            argc--;
            break;
        }
        else if (pp[0] == '-' && pp[1]) /* unknown option */
            exit_with_usage(pname);
        else
            break; /* not an option */
        argv++; /* advance to next argument */
        argc--;
    }

    /* apply per-mode defaults when the user did not specify -c / -g */
    if (!cols)
        switch (hextype)
        {
            case HEX_POSTSCRIPT: cols = 30; break;
            case HEX_CINCLUDE: cols = 12; break;
            case HEX_BITS: cols = 6; break;
            case HEX_NORMAL:
            default: cols = 16; break;
        }
    if (octspergrp < 0)
        switch (hextype)
        {
            case HEX_BITS: octspergrp = 1; break;
            case HEX_NORMAL: octspergrp = 2; break;
            case HEX_POSTSCRIPT:
            case HEX_CINCLUDE:
            default: octspergrp = 0; break;
        }
    if (cols < 1 || ((hextype == HEX_NORMAL || hextype == HEX_BITS)
        && (cols > COLS)))
    {
        fprintf(stderr, "%s: invalid number of columns (max. %d).\n", pname, COLS);
        exit(1);
    }
    if (octspergrp < 1)
        octspergrp = cols; /* 0 means "no grouping": one group per line */
    if (argc > 3)
        exit_with_usage(pname);

    /* open input: "-" or no argument means stdin (switched to binary
     * mode where the platform distinguishes) */
    if (argc == 1 || (argv[1][0] == '-' && !argv[1][1]))
        BIN_ASSIGN(fp = stdin, !revert);
    else
    {
        if ((fp = fopen(argv[1], BIN_READ(!revert))) == NULL)
        {
            fprintf(stderr,"%s: ", pname);
            perror(argv[1]);
            return 2;
        }
    }
    /* open output: "-" or no argument means stdout */
    if (argc < 3 || (argv[2][0] == '-' && !argv[2][1]))
        BIN_ASSIGN(fpo = stdout, revert);
    else
    {
        int fd;
        /* when reverting, don't truncate: the dump may patch in place */
        int mode = revert ? O_WRONLY : (O_TRUNC|O_WRONLY);

        if (((fd = OPEN(argv[2], mode | BIN_CREAT(revert), 0666)) < 0) ||
            (fpo = fdopen(fd, BIN_WRITE(revert))) == NULL)
        {
            fprintf(stderr, "%s: ", pname);
            perror(argv[2]);
            return 3;
        }
        rewind(fpo);
    }

    /* -r: revert a hexdump back to binary and exit */
    if (revert)
    {
        if (hextype && (hextype != HEX_POSTSCRIPT))
        {
            fprintf(stderr, "%s: sorry, cannot revert this type of hexdump\n", pname);
            return -1;
        }
        return huntype(fp, fpo, stderr, pname, cols, hextype,
            negseek ? -seekoff : seekoff);
    }

    /* position the input stream per -s; fall back to reading bytes
     * when seeking is unavailable or fails */
    if (seekoff || negseek || !relseek)
    {
#ifdef TRY_SEEK
        if (relseek)
            e = fseek(fp, negseek ? -seekoff : seekoff, 1);
        else
            e = fseek(fp, negseek ? -seekoff : seekoff, negseek ? 2 : 0);
        if (e < 0 && negseek)
        {
            fprintf(stderr, "%s: sorry cannot seek.\n", pname);
            return 4;
        }
        if (e >= 0)
            seekoff = ftell(fp);
        else
#endif
        {
            long s = seekoff;

            while (s--)
                (void)getc(fp);
        }
    }

    /* -i: emit a C array definition named after the input file */
    if (hextype == HEX_CINCLUDE)
    {
        const char* filename = extract_filename(argv[1]);

        if (fp != stdin)
        {
            /* prefix "__" if the name would start with a digit */
            fprintf(fpo, "unsigned char %s", isdigit((int)filename[0]) ? "__" : "");
            for (e = 0; (c = filename[e]) != 0; e++)
                putc(isalnum(c) ? c : '_', fpo);
            fputs("[] = {\n", fpo);
        }
        p = 0;
        while ((length < 0 || p < length) && (c = getc(fp)) != EOF)
        {
            /* "+2*!p" skips the leading ",\n" before the very first byte */
            fprintf(fpo, (hexx == hexxa) ? "%s0x%02x" : "%s0X%02X",
                (p % cols) ? ", " : ",\n "+2*!p, c);
            p++;
        }
        if (p)
            fputs("\n};\n"+3*(fp == stdin), fpo);
        if (fp != stdin)
        {
            fprintf(fpo, "unsigned int %s", isdigit((int)filename[0]) ? "__" : "");
            for (e = 0; (c = filename[e]) != 0; e++)
                putc(isalnum(c) ? c : '_', fpo);
            fprintf(fpo, "_len = %d;\n", p);
        }
        fclose(fp);
        fclose(fpo);
        return 0;
    }

    /* -ps: plain hex stream, cols digits pairs per line, no offsets */
    if (hextype == HEX_POSTSCRIPT)
    {
        p = cols;
        while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
        {
            putchar(hexx[(e >> 4) & 0xf]);
            putchar(hexx[(e ) & 0xf]);
            n++;
            if (!--p)
            {
                putchar('\n');
                p = cols;
            }
        }
        if (p < cols)
            putchar('\n');
        fclose(fp);
        fclose(fpo);
        return 0;
    }

    /* hextype: HEX_NORMAL or HEX_BITS */
    if (hextype == HEX_NORMAL)
        grplen = octspergrp + octspergrp + 1; /* chars per octet group */
    else /* hextype == HEX_BITS */
        grplen = 8 * octspergrp + 1;
    while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
    {
        if (p == 0)
        {
            /* start a new line: offset field, then blank out the rest */
            sprintf(l, "%07lx: ", n + seekoff);
            for (c = 9; c < LLEN; l[c++] = ' ');
        }
        if (hextype == HEX_NORMAL)
        {
            l[c = (9 + (grplen * p) / octspergrp)] = hexx[(e >> 4) & 0xf];
            l[++c] = hexx[ e & 0xf];
        }
        else /* hextype == HEX_BITS */
        {
            int i;

            c = (9 + (grplen * p) / octspergrp) - 1;
            for (i = 7; i >= 0; i--)
                l[++c] = (e & (1 << i)) ? '1' : '0';
        }
        if (ebcdic)
            e = (e < 64) ? '.' : etoa64[e-64];
        /* When changing this update definition of LLEN above. */
        l[11 + (grplen * cols - 1)/octspergrp + p] =
#ifdef __MVS__
            (e >= 64)
#else
            (e > 31 && e < 127)
#endif
            ? e : '.';
        if (e)
            nonzero++;
        n++;
        if (++p == cols)
        {
            /* line complete: terminate and hand to the autoskip filter */
            l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
            xxdline(fpo, l, autoskip ? nonzero : 1);
            nonzero = 0;
            p = 0;
        }
    }
    /* flush a final partial line, or flush suppressed zero lines */
    if (p)
    {
        l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
        xxdline(fpo, l, 1);
    }
    else if (autoskip)
        xxdline(fpo, l, -1); /* last chance to flush out suppressed lines */
    fclose(fp);
    fclose(fpo);
    return 0;
}

View File

@ -0,0 +1,28 @@
# General purpose GPU compute framework built on Vulkan to
# support 1000s of cross vendor graphics cards
# (AMD, Qualcomm, NVIDIA & friends). Blazing fast, mobile-enabled,
# asynchronous and optimized for advanced GPU data processing use cases.
# Backed by the Linux Foundation.
#
# Finding this module will define the following variables:
# KOMPUTE_FOUND - True if the core library has been found
# KOMPUTE_LIBRARIES - Path to the core library archive
# KOMPUTE_INCLUDE_DIRS - Path to the include directories. Gives access
# to kompute.h, as a single include which must be included in every
# file that uses this interface. Else it also points to the
# directory for individual includes.
# Locate the Kompute headers and library.  Defines the documented
# KOMPUTE_FOUND / KOMPUTE_LIBRARIES / KOMPUTE_INCLUDE_DIRS variables and,
# additionally, an imported target Kompute::Kompute that carries the
# include directory as a usage requirement.
find_path(KOMPUTE_INCLUDE_DIR
          NAMES kompute.h)

find_library(KOMPUTE_LIBRARY
             NAMES kompute
             HINTS ${KOMPUTE_LIBRARY_ROOT})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(KOMPUTE REQUIRED_VARS KOMPUTE_LIBRARY KOMPUTE_INCLUDE_DIR)

if(KOMPUTE_FOUND)
    set(KOMPUTE_LIBRARIES "${KOMPUTE_LIBRARY}")
    set(KOMPUTE_INCLUDE_DIRS "${KOMPUTE_INCLUDE_DIR}")
    # Hide the per-find cache entries from the default cmake-gui view.
    mark_as_advanced(KOMPUTE_LIBRARY KOMPUTE_INCLUDE_DIR)
    # Prefer an imported target so consumers get includes transitively
    # via target_link_libraries(... Kompute::Kompute).
    if(NOT TARGET Kompute::Kompute)
        add_library(Kompute::Kompute UNKNOWN IMPORTED)
        set_target_properties(Kompute::Kompute PROPERTIES
            IMPORTED_LOCATION "${KOMPUTE_LIBRARY}"
            INTERFACE_INCLUDE_DIRECTORIES "${KOMPUTE_INCLUDE_DIR}")
    endif()
endif()

145
kompute/op_add.comp Normal file
View File

@ -0,0 +1,145 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
/* Element-wise tensor addition kernel.  The long preamble of quantization
 * constants and block layouts below is boilerplate repeated in every
 * kompute/*.comp shader; this kernel's main() uses none of it. */
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
/* tile sizes (unused by this kernel) */
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
/* little-endian readers over a uint8_t buffer */
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
    float16_t d;
    uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
    float16_t d;
    float16_t m;
    uint8_t qs[QK4_1 / 2];
};
/* NOTE(review): duplicate of the QK_K / K_SCALE_SIZE guard above;
 * harmless because of the #ifndef, but could be removed upstream. */
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4]; // quants
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
    uint8_t scales[2];
#else
    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
    float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
    float16_t d[2]; // super-block scales/mins
    uint8_t scales[2];
    uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
    float16_t d; // super-block scales/mins
    int8_t scales[QK_K/16]; // 8-bit block scales
    uint8_t qh[QK_K/8]; // quants, high bit
    uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8]; // quants, high bit
    uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
    uint8_t ql[QK_K/2]; // quants, lower 4 bits
    uint8_t qh[QK_K/4]; // quants, upper 2 bits
    int8_t scales[QK_K/16]; // scales, quantized with 8 bits
    float16_t d; // super-block scale
};
// 210 bytes / block
/* one single-invocation workgroup per output element */
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inAOff;  // element offset into inA
    uint inBOff;  // element offset into inB
    uint outOff;  // element offset into out_
    uint row;     // unused here (used by op_addrow)
} pcs;

/* out_[i] = inA[i] + inB[i], all indices shifted by the push-constant
 * base offsets; i is the workgroup index along x. */
void main() {
    const uint i = gl_WorkGroupID.x;

    out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i) + pcs.inBOff];
}

145
kompute/op_addrow.comp Normal file
View File

@ -0,0 +1,145 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
/* Row-broadcast addition kernel.  The long preamble of quantization
 * constants and block layouts below is boilerplate repeated in every
 * kompute/*.comp shader; this kernel's main() uses none of it. */
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
/* tile sizes (unused by this kernel) */
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
/* little-endian readers over a uint8_t buffer */
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
    float16_t d;
    uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
    float16_t d;
    float16_t m;
    uint8_t qs[QK4_1 / 2];
};
/* NOTE(review): duplicate of the QK_K / K_SCALE_SIZE guard above;
 * harmless because of the #ifndef, but could be removed upstream. */
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4]; // quants
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
    uint8_t scales[2];
#else
    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
    float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
    float16_t d[2]; // super-block scales/mins
    uint8_t scales[2];
    uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
    float16_t d; // super-block scales/mins
    int8_t scales[QK_K/16]; // 8-bit block scales
    uint8_t qh[QK_K/8]; // quants, high bit
    uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8]; // quants, high bit
    uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
    uint8_t ql[QK_K/2]; // quants, lower 4 bits
    uint8_t qh[QK_K/4]; // quants, upper 2 bits
    int8_t scales[QK_K/16]; // scales, quantized with 8 bits
    float16_t d; // super-block scale
};
// 210 bytes / block
/* one single-invocation workgroup per output element */
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inAOff;  // element offset into inA
    uint inBOff;  // element offset into inB
    uint outOff;  // element offset into out_
    uint row;     // length of the row in inB that is broadcast over inA
} pcs;

/* out_[i] = inA[i] + inB[i % row]: inB is treated as a single row of
 * pcs.row elements, repeated (broadcast) across all of inA. */
void main() {
    const uint i = gl_WorkGroupID.x;

    out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
}

176
kompute/op_cpy_f16_f16.comp Normal file
View File

@ -0,0 +1,176 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
/* Strided tensor copy kernel, float16 -> float16.  The long preamble of
 * quantization constants and block layouts below is boilerplate repeated
 * in every kompute/*.comp shader; this kernel's main() uses none of it. */
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
/* tile sizes (unused by this kernel) */
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
/* little-endian readers over a uint8_t buffer */
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
    float16_t d;
    uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
    float16_t d;
    float16_t m;
    uint8_t qs[QK4_1 / 2];
};
/* NOTE(review): duplicate of the QK_K / K_SCALE_SIZE guard above;
 * harmless because of the #ifndef, but could be removed upstream. */
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4]; // quants
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
    uint8_t scales[2];
#else
    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
    float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
    float16_t d[2]; // super-block scales/mins
    uint8_t scales[2];
    uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
    float16_t d; // super-block scales/mins
    int8_t scales[QK_K/16]; // 8-bit block scales
    uint8_t qh[QK_K/8]; // quants, high bit
    uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8]; // quants, high bit
    uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
    uint8_t ql[QK_K/2]; // quants, lower 4 bits
    uint8_t qh[QK_K/4]; // quants, upper 2 bits
    int8_t scales[QK_K/16]; // scales, quantized with 8 bits
    float16_t d; // super-block scale
};
// 210 bytes / block
/* element types for this variant: f16 in, f16 out */
#define nth 32
#define IN_TYPE float16_t
#define IN_TYPE_SIZE 2
#define OUT_TYPE float16_t
#define OUT_TYPE_SIZE 2
layout(local_size_x = nth) in;
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
layout (push_constant) uniform parameter {
    uint inOff;   // element offset into in_
    uint outOff;  // element offset into out_
    int ne00;     // source extents
    int ne01;
    int ne02;
    uint nb00;    // source strides, in bytes
    uint nb01;
    uint nb02;
    uint nb03;
    int ne0;      // destination extents
    int ne1;
    int ne2;
    uint nb0;     // destination strides, in bytes
    uint nb1;
    uint nb2;
    uint nb3;
} pcs;

/* One workgroup per source row (i01, i02, i03); the nth invocations
 * stride across the row.  The flat element index n of the row start is
 * re-split using the destination extents ne0..ne2, then turned into an
 * element offset via the destination byte strides (divided by the
 * element size).  Reads use the source byte strides the same way, so
 * arbitrary (contiguous-row) relayouts are supported. */
void main() {
    const uint i03 = gl_WorkGroupID.z;
    const uint i02 = gl_WorkGroupID.y;
    const uint i01 = gl_WorkGroupID.x;

    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;

    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);

    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_data+i00] = OUT_TYPE(in_[src]);
    }
}

176
kompute/op_cpy_f16_f32.comp Normal file
View File

@ -0,0 +1,176 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
/* Strided tensor copy kernel, float16 -> float32.  The long preamble of
 * quantization constants and block layouts below is boilerplate repeated
 * in every kompute/*.comp shader; this kernel's main() uses none of it. */
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
/* tile sizes (unused by this kernel) */
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
/* little-endian readers over a uint8_t buffer */
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
    float16_t d;
    uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
    float16_t d;
    float16_t m;
    uint8_t qs[QK4_1 / 2];
};
/* NOTE(review): duplicate of the QK_K / K_SCALE_SIZE guard above;
 * harmless because of the #ifndef, but could be removed upstream. */
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4]; // quants
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
    uint8_t scales[2];
#else
    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
    float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
    float16_t d[2]; // super-block scales/mins
    uint8_t scales[2];
    uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
    float16_t d; // super-block scales/mins
    int8_t scales[QK_K/16]; // 8-bit block scales
    uint8_t qh[QK_K/8]; // quants, high bit
    uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
    float16_t d; // super-block scale for quantized scales
    float16_t dmin; // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8]; // quants, high bit
    uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
    uint8_t ql[QK_K/2]; // quants, lower 4 bits
    uint8_t qh[QK_K/4]; // quants, upper 2 bits
    int8_t scales[QK_K/16]; // scales, quantized with 8 bits
    float16_t d; // super-block scale
};
// 210 bytes / block
/* element types for this variant: f16 in, f32 out */
#define nth 32
#define IN_TYPE float16_t
#define IN_TYPE_SIZE 2
#define OUT_TYPE float
#define OUT_TYPE_SIZE 4
layout(local_size_x = nth) in;
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
layout (push_constant) uniform parameter {
    uint inOff;   // element offset into in_
    uint outOff;  // element offset into out_
    int ne00;     // source extents
    int ne01;
    int ne02;
    uint nb00;    // source strides, in bytes
    uint nb01;
    uint nb02;
    uint nb03;
    int ne0;      // destination extents
    int ne1;
    int ne2;
    uint nb0;     // destination strides, in bytes
    uint nb1;
    uint nb2;
    uint nb3;
} pcs;

/* One workgroup per source row (i01, i02, i03); the nth invocations
 * stride across the row.  The flat element index n of the row start is
 * re-split using the destination extents ne0..ne2, then turned into an
 * element offset via the destination byte strides (divided by the
 * element size).  Each f16 element is widened to f32 by the OUT_TYPE
 * constructor on store. */
void main() {
    const uint i03 = gl_WorkGroupID.z;
    const uint i02 = gl_WorkGroupID.y;
    const uint i01 = gl_WorkGroupID.x;

    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;

    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);

    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_data+i00] = OUT_TYPE(in_[src]);
    }
}

176
kompute/op_cpy_f32_f16.comp Normal file
View File

@ -0,0 +1,176 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
// Quantized block layouts shared by the kompute shaders; intended to mirror
// ggml's block_q* formats so device buffers can be interpreted in place.
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// NOTE(review): this QK_K/K_SCALE_SIZE guard duplicates the identical one
// earlier in the generated prelude; harmless (idempotent), but redundant.
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
// NOTE(review): `typedef` is not valid GLSL; this branch only survives because
// QK_K defaults to 256, so the preprocessor discards it -- fix before ever
// building with QK_K == 64.
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
#define nth 32
#define IN_TYPE float
#define IN_TYPE_SIZE 4
#define OUT_TYPE float16_t
#define OUT_TYPE_SIZE 2
layout(local_size_x = nth) in;
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
layout (push_constant) uniform parameter {
uint inOff;
uint outOff;
int ne00;
int ne01;
int ne02;
uint nb00;
uint nb01;
uint nb02;
uint nb03;
int ne0;
int ne1;
int ne2;
uint nb0;
uint nb1;
uint nb2;
uint nb3;
} pcs;
// Strided copy with conversion: f32 in, f16 out. One workgroup per source
// row; the nth lanes cooperatively convert the row's ne00 elements.
void main() {
const uint gz = gl_WorkGroupID.z; // source i03
const uint gy = gl_WorkGroupID.y; // source i02
const uint gx = gl_WorkGroupID.x; // source i01
// Linearize the source row coordinate...
const int rowElems = pcs.ne01*pcs.ne00;
const int n = int(gz)*pcs.ne02*rowElems + int(gy)*rowElems + int(gx)*pcs.ne00;
// ...and peel it back apart along the destination shape.
const int plane = pcs.ne1*pcs.ne0;
const int vol = pcs.ne2*plane;
const int i3 = n / vol;
int rem = n - i3*vol;
const int i2 = rem / plane;
rem -= i2*plane;
const int i1 = rem / pcs.ne0;
const int i0 = rem - i1*pcs.ne0;
// Destination base index (elements), offset from out_.
const uint dstBase = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff;
for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += nth) {
const uint srcIdx = uint((gz*pcs.nb03 + gy*pcs.nb02 + gx*pcs.nb01 + col*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff;
out_[dstBase + col] = OUT_TYPE(in_[srcIdx]);
}
}

168
kompute/op_cpy_f32_f32.comp Normal file
View File

@ -0,0 +1,168 @@
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
// Quantized block layouts shared by the kompute shaders; intended to mirror
// ggml's block_q* formats so device buffers can be interpreted in place.
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// NOTE(review): this QK_K/K_SCALE_SIZE guard duplicates the identical one
// earlier in the generated prelude; harmless (idempotent), but redundant.
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
// NOTE(review): `typedef` is not valid GLSL; this branch only survives because
// QK_K defaults to 256, so the preprocessor discards it -- fix before ever
// building with QK_K == 64.
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
// Strided tensor copy, f32 -> f32 (ggml "cpy"/"dup" op): no conversion, but
// source and destination may have different shapes/strides.
// One workgroup per source row (i01,i02,i03); nth lanes stride across ne00.
#define nth 32
#define IN_TYPE float
#define IN_TYPE_SIZE 4 // sizeof(float) in bytes
#define OUT_TYPE float
#define OUT_TYPE_SIZE 4 // sizeof(float) in bytes
layout(local_size_x = nth) in;
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
layout (push_constant) uniform parameter {
uint inOff; // element offset into in_
uint outOff; // element offset into out_
int ne00; // source extents (elements)
int ne01;
int ne02;
uint nb00; // source strides (bytes; divided by IN_TYPE_SIZE below)
uint nb01;
uint nb02;
uint nb03;
int ne0; // destination extents (elements)
int ne1;
int ne2;
uint nb0; // destination strides (bytes; divided by OUT_TYPE_SIZE below)
uint nb1;
uint nb2;
uint nb3;
} pcs;
void main() {
const uint i03 = gl_WorkGroupID.z;
const uint i02 = gl_WorkGroupID.y;
const uint i01 = gl_WorkGroupID.x;
// Flatten the source row coordinate to a linear element index...
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
// ...then re-split that index along the destination shape (which may differ from the source's).
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ (element index)
// Each of the nth lanes copies every nth element of the row.
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
out_[dst_data+i00] = OUT_TYPE(in_[src]);
}
}

153
kompute/op_diagmask.comp Normal file
View File

@ -0,0 +1,153 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
// Quantized block layouts shared by the kompute shaders; intended to mirror
// ggml's block_q* formats so device buffers can be interpreted in place.
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// NOTE(review): this QK_K/K_SCALE_SIZE guard duplicates the identical one
// earlier in the generated prelude; harmless (idempotent), but redundant.
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
// NOTE(review): `typedef` is not valid GLSL; this branch only survives because
// QK_K defaults to 256, so the preprocessor discards it -- fix before ever
// building with QK_K == 64.
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
uint n_past;
int ne00;
int ne01;
} pcs;
// Causal diagonal mask: copy entries at or left of the (n_past-shifted)
// diagonal, overwrite everything to the right with -infinity.
void main() {
const uint plane = gl_WorkGroupID.z; // i02
const uint row = gl_WorkGroupID.y; // i01
const uint col = gl_WorkGroupID.x; // i00
const uint idx = plane*pcs.ne01*pcs.ne00 + row*pcs.ne00 + col;
const bool keep = col <= pcs.n_past + row;
// 0xFF800000 is the IEEE-754 bit pattern for -inf.
out_[idx + pcs.outOff] = keep ? in_[idx + pcs.inOff] : uintBitsToFloat(0xFF800000);
}

142
kompute/op_gelu.comp Normal file
View File

@ -0,0 +1,142 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
// Quantized block layouts shared by the kompute shaders; intended to mirror
// ggml's block_q* formats so device buffers can be interpreted in place.
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// NOTE(review): this QK_K/K_SCALE_SIZE guard duplicates the identical one
// earlier in the generated prelude; harmless (idempotent), but redundant.
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
// NOTE(review): `typedef` is not valid GLSL; this branch only survives because
// QK_K defaults to 256, so the preprocessor discards it -- fix before ever
// building with QK_K == 64.
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
} pcs;
// Elementwise GELU (tanh approximation), one element per workgroup:
// 0.5*x*(1 + tanh(sqrt(2/pi)*x*(1 + 0.044715*x^2)))
void main() {
const uint idx = gl_WorkGroupID.x;
const float v = in_[idx + pcs.inOff];
const float t = tanh(SQRT_2_OVER_PI*v*(1.0 + GELU_COEF_A*v*v));
out_[idx + pcs.outOff] = 0.5*v*(1.0 + t);
}

150
kompute/op_getrows_f16.comp Normal file
View File

@ -0,0 +1,150 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
// Quantized block layouts shared by the kompute shaders; intended to mirror
// ggml's block_q* formats so device buffers can be interpreted in place.
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// NOTE(review): this QK_K/K_SCALE_SIZE guard duplicates the identical one
// earlier in the generated prelude; harmless (idempotent), but redundant.
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
// NOTE(review): `typedef` is not valid GLSL; this branch only survives because
// QK_K defaults to 256, so the preprocessor discards it -- fix before ever
// building with QK_K == 64.
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 1) in;
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; // source matrix (f16)
layout (binding = 1) readonly buffer tensorInB { int inB[]; }; // row indices to gather
layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; // gathered rows (f32)
layout (push_constant) uniform parameter {
uint inAOff; // element offset into inA
uint inBOff; // element offset into inB
uint outOff; // element offset into out_
int ne00; // row length (elements)
int nb01; // inA row stride in bytes (hence the /2 for f16 below)
int nb1; // out_ row stride; used unscaled -- presumably already in elements, unlike the byte-based /4 in the q4 variants; TODO confirm against the host-side dispatch
} pcs;
// get_rows: for each output row i, copy source row inB[i] from inA into out_,
// converting f16 -> f32 element by element. One workgroup per output row.
void main() {
const uint i = gl_WorkGroupID.x;
const int r = inB[i + pcs.inBOff]; // source row to gather
for (int j = 0; j < pcs.ne00; j++) {
out_[i*pcs.nb1 + j + pcs.outOff] = inA[r*pcs.nb01/2+j + pcs.inAOff];
}
}

View File

@ -0,0 +1,179 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
// Quantized block layouts shared by the kompute shaders; intended to mirror
// ggml's block_q* formats so device buffers can be interpreted in place.
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// NOTE(review): this QK_K/K_SCALE_SIZE guard duplicates the identical one
// earlier in the generated prelude; harmless (idempotent), but redundant.
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
// NOTE(review): `typedef` is not valid GLSL; this branch only survives because
// QK_K defaults to 256, so the preprocessor discards it -- fix before ever
// building with QK_K == 64.
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; // raw q4_0 data, byte-addressed
layout (binding = 1) readonly buffer tensorInB { int inB[]; }; // row indices to gather
layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; // dequantized rows (f32)
layout (push_constant) uniform parameter {
uint inAOff; // byte offset into inA
uint inBOff; // element offset into inB
uint outOff; // element offset into out_
int ne00; // elements per row (multiple of QK4_0)
int nb01; // inA row stride in bytes
int nb1; // out_ row stride in bytes (hence the /4 in main)
} pcs;
#define UNALIGNED_INPUT inA
// Assemble a block_q4_0 from the byte buffer at an arbitrary (possibly
// unaligned) byte offset: 2 bytes f16 scale d, then QK4_0/2 packed nibbles.
block_q4_0 get_unaligned_block_q4_0(uint index) {
block_q4_0 fres;
fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
[[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
fres.qs[it] = UNALIGNED_INPUT[index+2+it];
}
return fres;
}
// Dequantize k elements starting at byte offset x in inA into out_ at element
// index y: out = nibble_value * d, with nibbles re-centered by -8.
void dequantize_row_q4_0(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
const uint qk = QK4_0;
const uint nb = k / qk;
for (uint i = 0; i < nb; i++) {
const block_q4_0 block = get_unaligned_block_q4_0(x + i*sizeof_block_q4_0);
const float16_t d = block.d;
for (uint j = 0; j < qk/2; ++j) {
// Low nibble fills the first half of the block, high nibble the second half.
const int x0 = (block.qs[j] & 0x0F) - 8;
const int x1 = (block.qs[j] >> 4) - 8;
out_[y+i*qk + j + 0 ] = float(x0)*d;
out_[y+i*qk + j + qk/2] = float(x1)*d;
}
}
}
// get_rows for q4_0: one workgroup per output row; gathers source row inB[i]
// and writes it to out_ dequantized to f32.
void main() {
const uint i = gl_WorkGroupID.x;
const int r = inB[i + pcs.inBOff];
dequantize_row_q4_0(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
}

View File

@ -0,0 +1,181 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
// Quantized block layouts shared by the kompute shaders; intended to mirror
// ggml's block_q* formats so device buffers can be interpreted in place.
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// NOTE(review): this QK_K/K_SCALE_SIZE guard duplicates the identical one
// earlier in the generated prelude; harmless (idempotent), but redundant.
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
// NOTE(review): `typedef` is not valid GLSL; this branch only survives because
// QK_K defaults to 256, so the preprocessor discards it -- fix before ever
// building with QK_K == 64.
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; // raw q4_1 data, byte-addressed
layout (binding = 1) readonly buffer tensorInB { int inB[]; }; // row indices to gather
layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; // dequantized rows (f32)
layout (push_constant) uniform parameter {
uint inAOff; // byte offset into inA
uint inBOff; // element offset into inB
uint outOff; // element offset into out_
int ne00; // elements per row (multiple of QK4_1)
int nb01; // inA row stride in bytes
int nb1; // out_ row stride in bytes (hence the /4 in main)
} pcs;
#define UNALIGNED_INPUT inA
// Assemble a block_q4_1 from the byte buffer at an arbitrary (possibly
// unaligned) byte offset: f16 scale d (2 bytes), f16 min m (2 bytes), then
// QK4_1/2 packed nibbles -- 20 bytes (sizeof_block_q4_1) in total.
block_q4_1 get_unaligned_block_q4_1(uint index) {
block_q4_1 fres;
fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
fres.qs[it] = UNALIGNED_INPUT[index+4+it];
}
return fres;
}
// Dequantize k elements starting at byte offset x in inA into out_ at element
// index y: out = nibble_value * d + m.
void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
const uint qk = QK4_1;
const uint nb = k / qk;
for (uint i = 0; i < nb; i++) {
// BUGFIX: stride blocks by sizeof_block_q4_1 (0x14 = 20 bytes: d + m + 16
// qs bytes, exactly what get_unaligned_block_q4_1 reads), not
// sizeof_block_q4_0 (18 bytes) -- the old stride misread every block after
// the first one in each row.
const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1);
const float16_t d = block.d;
const float16_t m = block.m;
for (uint j = 0; j < qk/2; ++j) {
// Low nibble fills the first half of the block, high nibble the second half.
const int x0 = (block.qs[j] & 0x0F);
const int x1 = (block.qs[j] >> 4);
out_[y+i*qk + j + 0 ] = float(x0)*d + m;
out_[y+i*qk + j + qk/2] = float(x1)*d + m;
}
}
}
// get_rows for q4_1: one workgroup per output row; gathers source row inB[i]
// and writes it to out_ dequantized to f32.
void main() {
const uint i = gl_WorkGroupID.x;
const int r = inB[i + pcs.inBOff];
dequantize_row_q4_1(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
}

145
kompute/op_mul.comp Normal file
View File

@ -0,0 +1,145 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
// Quantized block layouts shared by the kompute shaders; intended to mirror
// ggml's block_q* formats so device buffers can be interpreted in place.
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// NOTE(review): this QK_K/K_SCALE_SIZE guard duplicates the identical one
// earlier in the generated prelude; harmless (idempotent), but redundant.
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
// NOTE(review): `typedef` is not valid GLSL; this branch only survives because
// QK_K defaults to 256, so the preprocessor discards it -- fix before ever
// building with QK_K == 64.
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
// One invocation per output element (workgroup size 1; dispatch count = N).
layout(local_size_x = 1) in;
// Element-wise product: out_ = inA * inB, all three fp32 buffers.
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inAOff; // element offset into inA
    uint inBOff; // element offset into inB
    uint outOff; // element offset into out_
    uint row;    // unused by this kernel's main (see op_mulrow variant)
} pcs;
// Each workgroup handles exactly one element: out[i] = inA[i] * inB[i],
// every buffer addressed relative to its own base offset.
void main() {
    const uint idx = gl_WorkGroupID.x;
    const float a = inA[pcs.inAOff + idx];
    const float b = inB[pcs.inBOff + idx];
    out_[pcs.outOff + idx] = a * b;
}

177
kompute/op_mul_mat_f16.comp Normal file
View File

@ -0,0 +1,177 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 64) in;
// Mat-mul with fp16 A and fp32 B: one 64-wide workgroup computes a single
// output scalar as a dot product of length ne00.
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
    uint inAOff; // element offset into inA
    uint inBOff; // element offset into inB
    uint outOff; // element offset into out_
    int ne00;    // dot-product length (elements per row)
    uint nb01;   // presumably A row stride in bytes (divided by 2 below) -- TODO confirm
    uint nb02;   // presumably A matrix stride in bytes -- TODO confirm
    uint nb11;   // presumably B row stride in bytes (divided by 4 below) -- TODO confirm
    uint nb12;   // presumably B matrix stride in bytes -- TODO confirm
    int ne0;     // output dims used to linearize the result index
    int ne1;
} pcs;
// One partial sum per invocation, reduced in shared memory below.
shared float sum[gl_WorkGroupSize.x];
void main() {
    // Workgroup coordinates select one output element: row r0 of A,
    // row r1 of B, batch/matrix index im.
    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint im = gl_WorkGroupID.z;
    // Byte strides converted to element indices: /2 for fp16 A, /4 for fp32 B.
    const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA
    const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB
    // Strided partial dot product: invocation k handles elements k, k+64, ...
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]);
    }
    // accumulate the sum from all threads in the threadgroup
    // (tree reduction in shared memory, halving the active count each step)
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }
    // Invocation 0 writes the fully reduced dot product.
    if (gl_LocalInvocationID.x == 0) {
        out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0];
    }
}

View File

@ -0,0 +1,195 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 8, local_size_y = 8) in;
// Mat-mul with q4_0-quantized A (raw bytes) and fp32 B: one 8x8 workgroup
// computes one output scalar as a dot product of length ne00.
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
    uint inAOff; // byte offset into inA
    uint inBOff; // element offset into inB
    uint outOff; // element offset into out_
    int ne00;    // dot length; assumed a multiple of QK4_0 (integer division below)
    int ne10;    // row stride (elements) of B
    int ne0;     // row stride (elements) of the output
} pcs;
shared float sum[64]; // one partial sum per invocation (8*8)
void main() {
    const uint nb = uint(pcs.ne00/QK4_0); // q4_0 blocks per row
    const uint r0 = gl_WorkGroupID.x;     // output row (row of A)
    const uint r1 = gl_WorkGroupID.y;     // output column (row of B)
    const uint x = r0*nb; // Based from inA without base offset
    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y; // 64 invocations
    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
    // Local y is split into a half-block selector (ix) and a 4-value slice (iy),
    // so 8 invocations cooperate on every pair of blocks.
    const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
    const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
    const uint first = 4 * iy;
    float sumf = 0.0;
    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
        // Block layout: fp16 scale d in bytes [0,2), 16 nibble-packed quants after.
        const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
        const float d = float(u8BufToFloat16(inA, index));
        const uint xl = first; // Based from bl->qs
        const uint yl = y + i * QK4_0 + first; // Based from inB
        vec2 acc = vec2(0.0, 0.0);
        for (int j = 0; j < 4; ++j) {
            const uint8_t b = inA[index+2+xl+j];
            // Low nibble covers elements [0,16) of the block, high nibble [16,32).
            acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
            acc.y += inB[yl+j] + inB[yl+j+16];
        }
        // q4_0 dequantizes as d*(q - 8): d*sum(inB*q) - 8*d*sum(inB).
        sumf += d * (acc.x - 8.*acc.y);
    }
    sum[ith] = sumf;
    //
    // Accumulate the sum from all threads in the threadgroup
    // (serial reduction by invocation 0 after the barrier)
    //
    barrier();
    if (ith == 0) {
        float sumTotal = 0.0;
        for (uint i = 0; i < nth; ++i) {
            sumTotal += sum[i];
        }
        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
    }
}

View File

@ -0,0 +1,218 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 8, local_size_y = 8) in;
// Mat-mul with q4_1-quantized A (raw bytes) and fp32 B: one 8x8 workgroup
// computes one output scalar as a dot product of length ne00.
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
    uint inAOff; // byte offset into inA
    uint inBOff; // element offset into inB
    uint outOff; // element offset into out_
    int ne00;    // dot length; assumed a multiple of QK4_1 (integer division below)
    int ne10;    // row stride (elements) of B
    int ne0;     // row stride (elements) of the output
} pcs;
shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y];
// Buffer that get_unaligned_block_q4_1 reads from.
#define UNALIGNED_INPUT inA
// Reassemble a block_q4_1 from the raw byte buffer at an arbitrary
// (possibly unaligned) byte offset: fp16 scale d at +0, fp16 min m at +2,
// followed by QK4_1/2 nibble-packed quant bytes starting at +4.
block_q4_1 get_unaligned_block_q4_1(uint index) {
    block_q4_1 blk;
    blk.d = u8BufToFloat16(UNALIGNED_INPUT, index);
    blk.m = u8BufToFloat16(UNALIGNED_INPUT, index + 2);
    const uint qs_base = index + 4;
    [[unroll]] for (uint b = 0; b < QK4_1 / 2; ++b) {
        blk.qs[b] = UNALIGNED_INPUT[qs_base + b];
    }
    return blk;
}
void main() {
    const uint nb = uint(pcs.ne00/QK4_1); // q4_1 blocks per row
    const uint r0 = gl_WorkGroupID.x;     // output row (row of A)
    const uint r1 = gl_WorkGroupID.y;     // output column (row of B)
    const uint x = r0*nb; // Based from inA without base offset
    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y; // 64 invocations
    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
    // Local y splits into a half-block selector (ix) and a 4-value slice (iy).
    const uint ix = gl_LocalInvocationID.y/4; // 0 or 1
    const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
    const uint first = 4 * iy;
    float sumf = 0.0;
    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
        //TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it:
        const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff);
        const float d = float(block.d);
        const float m = float(block.m);
        const uint xl = first; // Based from bl->qs
        const uint yl = y + i * QK4_1 + first; // Based from inB
        // q4_1 dequantizes as d*q + m. acc.x accumulates the low-nibble half
        // (elements [0,16) of the block), acc.y the high-nibble half ([16,32)),
        // each already fully dequantized.
        vec2 acc = vec2(0.0, 0.0);
        for (int j = 0; j < 4; ++j) {
            acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m);
            acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m);
        }
        // BUGFIX: both halves are fully dequantized above, so the block's
        // contribution is simply their sum. The previous `d * (acc.x - acc.y)`
        // was a copy/paste of the q4_0 combine step (where acc held raw quant
        // sums), which double-scaled by d and subtracted the high-nibble half.
        sumf += acc.x + acc.y;
    }
    sum[ith] = sumf;
    //
    // Accumulate the sum from all threads in the threadgroup
    // (staged reduction: groups of 4, then groups of 16, then serial).
    //
    barrier();
    memoryBarrierShared();
    if (ith%4 == 0) {
        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
    }
    barrier();
    memoryBarrierShared();
    if (ith%16 == 0) {
        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
    }
    barrier();
    memoryBarrierShared();
    if (ith == 0) {
        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0];
    }
}

145
kompute/op_mulrow.comp Normal file
View File

@ -0,0 +1,145 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
// One invocation per output element (workgroup size 1).
layout(local_size_x = 1) in;
// Row-broadcast multiply: out_[i] = inA[i] * inB[i mod row].
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inAOff; // element offset into inA
    uint inBOff; // element offset into inB
    uint outOff; // element offset into out_
    uint row;    // length of the broadcast row in inB
} pcs;
// Broadcast multiply: every output element is the matching inA element
// scaled by the row-vector inB, which repeats with period pcs.row.
void main() {
    const uint idx = gl_WorkGroupID.x;
    const uint col = idx % pcs.row; // wrap into the broadcast row
    out_[pcs.outOff + idx] = inA[pcs.inAOff + idx] * inB[pcs.inBOff + col];
}

209
kompute/op_norm.comp Normal file
View File

@ -0,0 +1,209 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
// One workgroup of nth invocations normalizes one row.
#define nth 256
layout(local_size_x = nth) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
// out_ is read back during the variance pass, hence not writeonly.
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inOff;  // element offset into in_
    uint outOff; // element offset into out_
    uint ne00;   // elements per row (loop bound below)
    uint nb01;   // input row stride in bytes (divided by 4 in main)
    float eps;   // added to the variance for numerical stability
} pcs;
// Scratch for the parallel reductions (mean pass, then variance pass).
shared float sum[nth];
// Layer norm (without gain/bias): per row, subtract the mean and divide by
// sqrt(variance + eps), via two shared-memory reduction passes.
void main() {
    // pcs.nb01 is the input row stride in bytes; /4 converts to float indices.
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
    // MEAN
    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        sum[gl_LocalInvocationID.x] += in_[x+i00];
    }
    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }
    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float mean = sum[0];
    // recenter: output rows are densely packed, ne00 floats per row.
    // BUGFIX: this previously used pcs.ne00/4, mixing the element count with
    // a byte->float conversion (only valid for byte strides such as nb01);
    // that made consecutive output rows overlap at a quarter stride.
    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        out_[y+i00] = in_[x+i00] - mean;
    }
    // VARIANCE
    // parallel sum of squared deviations (out_ now holds x - mean)
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
    }
    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }
    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float variance = sum[0];
    // scale in place by 1/sqrt(variance + eps)
    const float scale = 1.0f/sqrt(variance + pcs.eps);
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        out_[y+i00] *= scale;
    }
}

141
kompute/op_relu.comp Normal file
View File

@ -0,0 +1,141 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
struct block_q4_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
// One invocation per element (workgroup size 1).
layout(local_size_x = 1) in;
// ReLU: out_[i] = max(0, in_[i]).
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inOff;  // element offset into in_
    uint outOff; // element offset into out_
} pcs;
// Rectified linear unit applied element-wise, one element per workgroup.
void main() {
    const uint idx = gl_WorkGroupID.x;
    const float v = in_[pcs.inOff + idx];
    out_[pcs.outOff + idx] = max(v, 0.0);
}

178
kompute/op_rmsnorm.comp Normal file
View File

@ -0,0 +1,178 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
// 4-bit k-quant super-block (2 + 2 + K_SCALE_SIZE + QK_K/2 bytes)
struct block_q4_K {
    float16_t d;          // super-block scale for quantized scales
    float16_t dmin;       // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];   // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
#define nth 256
layout(local_size_x = nth) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
uint ne00;
uint nb01;
float eps;
} pcs;
shared float sum[nth];
void main() {
    // Row start (in floats) for this workgroup; nb01 is the row stride in
    // bytes, hence the /4 for a float buffer.
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_

    // parallel sum of squares: each invocation accumulates an nth-strided slice
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
    }

    // reduce: pairwise tree reduction of the partial sums into sum[0]
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast: invocation 0 converts the total into the mean of squares
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();

    // RMS normalization factor; eps guards against division by zero
    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);

    // NOTE(review): ne00 is an element count, so the /4 here looks like a
    // leftover from a byte-stride computation (compare x above, which divides
    // a byte stride) -- confirm the intended output row offset.
    const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        out_[y+i00] = in_[x+i00] * scale;
    }
}

183
kompute/op_rope.comp Normal file
View File

@ -0,0 +1,183 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
// 4-bit k-quant super-block (2 + 2 + K_SCALE_SIZE + QK_K/2 bytes)
struct block_q4_K {
    float16_t d;          // super-block scale for quantized scales
    float16_t dmin;       // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];   // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 1) in;
layout (binding = 0) readonly buffer tensorIn { float in_[]; };
layout (binding = 1) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
uint inOff;
uint outOff;
uint n_past;
int n_dims;
int mode;
float freq_base;
float freq_scale;
uint nb00;
uint nb01;
uint nb02;
uint nb03;
int ne0;
uint nb0;
uint nb1;
uint nb2;
uint nb3;
} pcs;
void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;

    // mode bit 1 selects the NeoX-style rotation (not implemented below)
    const bool is_neox = (pcs.mode & 2) != 0;
    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

    // mode bit 0 clear: position includes n_past (standard GPT-style RoPE)
    const uint p = ((pcs.mode & 1) == 0 ? pcs.n_past + i2 : i2);

    float theta = pcs.freq_scale * float(p);

    if (!is_neox) {
        // Rotate consecutive (even, odd) float pairs of the row by theta,
        // multiplying theta by theta_scale after each pair.
        // NOTE(review): this loops over the full ne0 row; ggml's rope applies
        // the rotation only to the first n_dims elements -- confirm intended.
        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);
            theta *= theta_scale;

            // nb* are byte strides; /4 converts them to float indices
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_

            // standard 2-D rotation of the (x0, x1) pair
            const float x0 = in_[src];
            const float x1 = in_[src+1];
            out_[dst_data] = x0*cos_theta - x1*sin_theta;
            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
        }
    } else {
        // TODO: implement
    }
}

142
kompute/op_scale.comp Normal file
View File

@ -0,0 +1,142 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
// 4-bit k-quant super-block (2 + 2 + K_SCALE_SIZE + QK_K/2 bytes)
struct block_q4_K {
    float16_t d;          // super-block scale for quantized scales
    float16_t dmin;       // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];   // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
float scale;
} pcs;
// Element-wise scale: one invocation per element, out = in * pcs.scale.
void main() {
    const uint elem = gl_WorkGroupID.x;
    out_[elem + pcs.outOff] = in_[elem + pcs.inOff] * pcs.scale;
}

141
kompute/op_silu.comp Normal file
View File

@ -0,0 +1,141 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
// 4-bit k-quant super-block (2 + 2 + K_SCALE_SIZE + QK_K/2 bytes)
struct block_q4_K {
    float16_t d;          // super-block scale for quantized scales
    float16_t dmin;       // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];   // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
} pcs;
// SiLU (sigmoid-weighted linear unit): out = x * sigmoid(x) = x / (1 + e^-x).
void main() {
    const uint elem = gl_WorkGroupID.x;
    const float v = in_[elem + pcs.inOff];
    const float sigmoid_denom = 1.0 + exp(-v);
    out_[elem + pcs.outOff] = v / sigmoid_denom;
}

197
kompute/op_softmax.comp Normal file
View File

@ -0,0 +1,197 @@
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#version 450
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
#define BM 128
#define BN 128
#define BK 8
#define TM 8
#define TN 8
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
#define sizeof_block_q4_1 0x14
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
#ifndef QK_K
#define QK_K 256
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
struct block_q2_K {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
};
// 84 bytes / block
struct block_q3_K {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
#if QK_K == 64
uint8_t scales[2];
#else
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
float16_t d; // super-block scale
};
#if QK_K == 64
typedef struct {
float16_t d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
// 4-bit k-quant super-block (2 + 2 + K_SCALE_SIZE + QK_K/2 bytes)
struct block_q4_K {
    float16_t d;          // super-block scale for quantized scales
    float16_t dmin;       // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];   // 4-bit quants
};
#endif
#if QK_K == 64
struct block_q5_K {
float16_t d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
#else
struct block_q5_K {
float16_t d; // super-block scale for quantized scales
float16_t dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
};
// 176 bytes / block
#endif
struct block_q6_K {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// 210 bytes / block
#define nth 32
layout(local_size_x = nth) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
int ne00;
int ne01;
int ne02;
} pcs;
shared float buf[nth];
// Numerically stable softmax over each ne00-length row: subtract the row max,
// exponentiate, normalize by the row sum. One workgroup (nth invocations) per row.
void main() {
    const uint i03 = gl_WorkGroupID.z;
    const uint i02 = gl_WorkGroupID.y;
    const uint i01 = gl_WorkGroupID.x;

    // Flat float offset of the row this workgroup operates on (dense layout).
    const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
    const uint psrc0 = extra_off + pcs.inOff; // Based from in_
    const uint pdst = extra_off + pcs.outOff; // Based from out_

    // parallel max: each invocation scans an nth-strided slice; seed with -inf
    buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000);
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]);
    }

    // reduce: tree reduction of per-invocation maxima into buf[0]
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]);
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast
    const float max_ = buf[0];
    // BUGFIX: every invocation must finish reading buf[0] before buf is
    // re-initialized for the sum phase below; without this barrier a fast
    // invocation 0 could overwrite buf[0] with 0.0 while others still read
    // the maximum (the sibling rmsnorm shader barriers at the same point).
    barrier();
    memoryBarrierShared();

    // parallel sum of exp(x - max) over the same strided slices
    buf[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_);
    }

    // reduce: tree reduction of the partial sums into buf[0]
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = nth/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast (safe: buf is only read from here on)
    const float sum = buf[0];

    // normalize and write out
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
        out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum;
    }
}

View File

@ -0,0 +1,148 @@
"""
Script to handle conversion of compute shaders to spirv and to headers
"""
import os
import sys
import logging
import click
import subprocess
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
is_windows = sys.platform.startswith('win')
CWD=os.path.dirname(os.path.abspath(__file__))
XXD_LINUX_CMD="xxd"
XXD_WINDOWS_CMD=os.path.abspath(os.path.join(CWD, "..\\external\\bin\\", "xxd.exe"))
SHADER_GENERATED_NOTICE = """/*
THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT
---
Copyright 2020 The Institute for Ethical AI & Machine Learning
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
"""
@click.command()
@click.option(
    "--shader-path",
    "-p",
    envvar="KOMPUTE_SHADER_PATH",
    required=True,
    help="The path for the directory to build and convert shaders",
)
@click.option(
    "--shader-binary",
    "-s",
    envvar="KOMPUTE_SHADER_BINARY",
    required=True,
    # BUGFIX: help text was copy-pasted from --shader-path; this option is the
    # shader compiler binary (invoked below as `<binary> -V <file> -o <spv>`).
    help="The path to the shader compiler binary used to compile shaders",
)
@click.option(
    "--header-path",
    "-c",
    envvar="KOMPUTE_HEADER_PATH",
    default="",
    required=False,
    help="The (optional) output file for the cpp header files",
)
@click.option(
    "--verbose",
    "-v",
    # BUGFIX: was bound to KOMPUTE_HEADER_PATH, so setting the header path via
    # the environment silently enabled verbose mode.
    envvar="KOMPUTE_VERBOSE",
    default=False,
    is_flag=True,
    help="Enable verbosity if flag is provided",
)
def run_cli(
    shader_path: str = None,
    shader_binary: str = None,
    header_path: str = None,
    verbose: bool = None,
):
    """
    CLI function for shader generation.

    Compiles every .comp shader found under shader_path to SPIR-V with
    shader_binary and, when header_path is given, wraps each binary into a
    C++ header via xxd.
    """
    logger.setLevel(logging.DEBUG if verbose else logging.WARNING)

    logger.debug(f"Starting script with variables: {locals()}")

    if is_windows:
        logger.debug("Running on windows, converting input paths")
        shader_path = shader_path.replace("/", "\\")
        header_path = header_path.replace("/", "\\")

    # Collect every compute-shader source below shader_path.
    shader_files = []
    for root, directory, files in os.walk(shader_path):
        for file in files:
            if file.endswith(".comp"):
                shader_files.append(os.path.join(root, file))

    run_cmd = lambda *args: subprocess.check_output([*args]).decode()

    logger.debug(f"Output spirv path: {shader_path}")
    logger.debug(f"Converting files to spirv: {shader_files}")
    spirv_files = []
    for file in shader_files:
        logger.debug(f"Converting to spirv: {file}")
        spirv_file = f"{file}.spv"
        run_cmd(shader_binary, "-V", file, "-o", spirv_file)
        spirv_files.append(spirv_file)

    # Create cpp files if header_path provided
    if header_path:
        logger.debug("Header path provided. Converting bin files to hpp.")
        # BUGFIX: this log line used to print shader_path instead.
        logger.debug(f"Output header path: {header_path}")
        # Pick the xxd command available on this platform.
        xxd_cmd = XXD_WINDOWS_CMD if is_windows else XXD_LINUX_CMD
        for file in spirv_files:
            # BUGFIX: replaced a stray debugging print(xxd_cmd) with a log call.
            logger.debug(f"Using xxd command: {xxd_cmd}")
            header_data = str(run_cmd(xxd_cmd, "-i", file))
            # Ensuring the variable is a static const unsigned
            header_data = header_data.replace("unsigned", "static const unsigned")
            if is_windows:
                raw_file_name = file.split("\\")[-1]
            else:
                raw_file_name = file.split("/")[-1]
            file_name = f"shader{raw_file_name}"
            header_file = file_name.replace(".comp.spv", ".hpp")
            header_file_define = "SHADEROP_" + header_file.replace(".", "_").upper()
            logger.debug(f"Converting to hpp: {file_name}")
            # Write an include-guarded header exposing the SPIR-V bytes in
            # namespace kp::shader_data.
            with open(os.path.join(header_path, header_file), "w+", newline='\n') as fstream:
                fstream.write(f"{SHADER_GENERATED_NOTICE}\n")
                fstream.write(f"#ifndef {header_file_define}\n")
                fstream.write(f"#define {header_file_define}\n\n")
                fstream.write("namespace kp {\n")
                fstream.write("namespace shader_data {\n")
                fstream.write(f"{header_data}")
                fstream.write("}\n")
                fstream.write("}\n")
                fstream.write(f"#endif // define {header_file_define}\n")


if __name__ == "__main__":
    run_cli()

View File

@ -0,0 +1,11 @@
# CLI dependencies
click==7.1.2
# Dev dependencies
black==19.10b0
quom==1.2.0
Sphinx==3.2.1
sphinx_material==0.0.30
breathe==4.20.0
m2r2==0.2.5
git+https://github.com/pybind/pybind11_mkdoc.git@master

93
kompute/setup.py Normal file
View File

@ -0,0 +1,93 @@
import os
import re
import platform
import sys
import sysconfig
import subprocess
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext
from distutils.version import LooseVersion
curr_dir = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(curr_dir, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
class CMakeExtension(Extension):
    """A setuptools Extension whose native code is built by CMake.

    No source files are listed: the CMakeBuild command drives the actual
    compilation from ``sourcedir``.
    """

    def __init__(self, name, sourcedir=''):
        super().__init__(name, sources=[])
        # Absolute path so CMake resolves it regardless of the build cwd.
        self.sourcedir = os.path.abspath(sourcedir)
class CMakeBuild(build_ext):
    """setuptools ``build_ext`` command that delegates compilation to CMake."""

    def run(self):
        # Verify a usable CMake is on PATH before attempting any build.
        try:
            out = subprocess.check_output(['cmake', '--version'])
        except OSError:
            raise RuntimeError("CMake must be installed to build the following extensions: " +
                               ", ".join(e.name for e in self.extensions))

        cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
        if cmake_version < '3.15':
            raise RuntimeError("CMake >= 3.15 is required")

        for ext in self.extensions:
            self.build_extension(ext)

    def build_extension(self, ext):
        """Configure and build one CMakeExtension into the package output dir."""
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
        # required for auto-detection of auxiliary "native" libs
        if not extdir.endswith(os.path.sep):
            extdir += os.path.sep

        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
                      '-DKOMPUTE_OPT_BUILD_PYTHON=ON',
                      '-DKOMPUTE_OPT_LOG_LEVEL=Off',
                      '-DKOMPUTE_OPT_USE_SPDLOG=Off',
                      # BUGFIX: a missing trailing comma here silently merged
                      # this option with -DPYTHON_EXECUTABLE below into one
                      # bogus CMake define, dropping the interpreter path.
                      '-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON',
                      '-DPYTHON_EXECUTABLE=' + sys.executable,
                      '-DPYTHON_INCLUDE_DIR=' + sysconfig.get_path('include'),
                      # NOTE(review): 'stdlib' is the standard-library dir, not
                      # the interpreter library path -- confirm this is intended.
                      '-DPYTHON_LIBRARY=' + sysconfig.get_path('stdlib'),
                      ]

        cfg = 'Debug' if self.debug else 'Release'
        build_args = ['--config', cfg]

        env = os.environ.copy()
        oldCxxFlags = env.get('CXXFLAGS', '')
        env['CXXFLAGS'] = f'{oldCxxFlags} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"'

        if platform.system() == "Windows":
            cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}']
            if sys.maxsize > 2**32:
                cmake_args += ['-A', 'x64']
            build_args += ['--', '/m']
        else:
            env['CXXFLAGS'] += ' -fPIC'
            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
            build_args += ['--', '-j']

        # Optional environment variable to limit the number of parallel jobs for
        # GitHub actions to reduce RAM usage.
        # BUGFIX: `build_args += <str>` extended the list one CHARACTER per
        # element (e.g. "16" became '1', '6'); append it as a single argument.
        if 'KOMPUTE_PYTHON_NUM_PARALLEL_THREADS' in env:
            build_args += [env['KOMPUTE_PYTHON_NUM_PARALLEL_THREADS']]

        if not os.path.exists(self.build_temp):
            os.makedirs(self.build_temp)
        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
# Package metadata for the Kompute Python bindings; the native module itself
# is compiled by the CMakeBuild command above.
setup(
    name='kp',
    version='0.8.1',
    author='Alejandro Saucedo',
    description='Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    ext_modules=[CMakeExtension('kp')],  # single CMake-driven extension
    install_requires=[
        "numpy<2.0.0"
    ],
    cmdclass=dict(build_ext=CMakeBuild),  # route `build_ext` through CMake
    zip_safe=False,
    include_package_data=True,
)

450
kompute/src/Algorithm.cpp Normal file
View File

@ -0,0 +1,450 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include <fstream>
#include "kompute/Algorithm.hpp"
namespace kp {
// Tear down all Vulkan resources owned by this algorithm on destruction.
Algorithm::~Algorithm()
{
    KP_LOG_DEBUG("Kompute Algorithm Destructor started");

    destroy();
}
// An algorithm counts as initialized only once every Vulkan handle exists.
bool
Algorithm::isInit()
{
    const bool pipelineReady = this->mPipeline && this->mPipelineCache &&
                               this->mPipelineLayout;
    const bool descriptorsReady = this->mDescriptorPool &&
                                  this->mDescriptorSet &&
                                  this->mDescriptorSetLayout;
    return pipelineReady && descriptorsReady && this->mShaderModule;
}
// Destroy every Vulkan handle this algorithm owns. Each handle is destroyed
// only when the matching mFree* flag says we own it AND the handle is
// non-null; handles are nulled afterwards so destroy() is safe to call more
// than once (the destructor also calls it).
void
Algorithm::destroy()
{
    // We don't have to free memory on destroy as it's freed by the
    // commandBuffer destructor:
    //   if (this->mPushConstantsData) { free(this->mPushConstantsData); }
    //   if (this->mSpecializationConstantsData) { free(this->mSpecializationConstantsData); }

    if (!this->mDevice) {
        KP_LOG_WARN("Kompute Algorithm destroy function reached with null "
                    "Device pointer");
        return;
    }

    // BUGFIX: removed the unreachable "requested to destroy ... but it is
    // null" warnings inside each branch -- the enclosing condition already
    // guarantees the handle is non-null at that point.
    if (this->mFreePipeline && this->mPipeline) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline");
        this->mDevice->destroy(
          *this->mPipeline,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mPipeline = nullptr;
    }

    if (this->mFreePipelineCache && this->mPipelineCache) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline cache");
        this->mDevice->destroy(
          *this->mPipelineCache,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mPipelineCache = nullptr;
    }

    if (this->mFreePipelineLayout && this->mPipelineLayout) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout");
        this->mDevice->destroy(
          *this->mPipelineLayout,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mPipelineLayout = nullptr;
    }

    if (this->mFreeShaderModule && this->mShaderModule) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying shader module");
        this->mDevice->destroy(
          *this->mShaderModule,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mShaderModule = nullptr;
    }

    freeParameters();
}
// Release the descriptor-set layout when this algorithm owns it; the
// descriptor set itself is reclaimed when its pool is destroyed.
void
Algorithm::freeParameters()
{
    // BUGFIX: removed the unreachable null-handle warning -- the enclosing
    // condition already guarantees mDescriptorSetLayout is non-null.
    if (this->mFreeDescriptorSetLayout && this->mDescriptorSetLayout) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Set Layout");
        this->mDevice->destroy(
          *this->mDescriptorSetLayout,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mDescriptorSetLayout = nullptr;
    }
}
void
Algorithm::createParameters()
{
KP_LOG_DEBUG("Kompute Algorithm createParameters started");
if (!*this->mDescriptorPool) {
KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
return;
}
std::vector<vk::DescriptorSetLayoutBinding> descriptorSetBindings;
for (size_t i = 0; i < this->mTensors.size(); i++) {
descriptorSetBindings.push_back(
vk::DescriptorSetLayoutBinding(i, // Binding index
vk::DescriptorType::eStorageBuffer,
1, // Descriptor count
vk::ShaderStageFlagBits::eCompute));
}
// This is the component that is fed into the pipeline
vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo(
vk::DescriptorSetLayoutCreateFlags(),
static_cast<uint32_t>(descriptorSetBindings.size()),
descriptorSetBindings.data());
KP_LOG_DEBUG("Kompute Algorithm creating descriptor set layout");
this->mDescriptorSetLayout = std::make_shared<vk::DescriptorSetLayout>();
vk::Result result = this->mDevice->createDescriptorSetLayout(
&descriptorSetLayoutInfo, nullptr, this->mDescriptorSetLayout.get());
if (result != vk::Result::eSuccess) {
KP_LOG_ERROR("Failed to create descriptor set layout. Error code: {}", vk::to_string(result));
} else {
this->mFreeDescriptorSetLayout = true;
KP_LOG_DEBUG("Successfully allocated descriptor set layout.");
}
vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
*this->mDescriptorPool,
1, // Descriptor set layout count
this->mDescriptorSetLayout.get());
KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
this->mDescriptorSet.get());
if (result != vk::Result::eSuccess) {
KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
} else {
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Successfully allocated descriptor sets.");
}
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
for (size_t i = 0; i < this->mTensors.size(); i++) {
std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
vk::DescriptorBufferInfo descriptorBufferInfo =
this->mTensors[i]->constructDescriptorBufferInfo();
computeWriteDescriptorSets.push_back(
vk::WriteDescriptorSet(*this->mDescriptorSet,
i, // Destination binding
0, // Destination array element
1, // Descriptor count
vk::DescriptorType::eStorageBuffer,
nullptr, // Descriptor image info
&descriptorBufferInfo));
this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
nullptr);
}
KP_LOG_DEBUG("Kompute Algorithm successfully run init");
}
void
Algorithm::updateParameters()
{
KP_LOG_DEBUG("Kompute Algorithm updateParameters started");
if (!*this->mDescriptorPool) {
KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
return;
}
vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
*this->mDescriptorPool,
1, // Descriptor set layout count
this->mDescriptorSetLayout.get());
KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
vk::Result result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
this->mDescriptorSet.get());
if (result != vk::Result::eSuccess) {
KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
} else {
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Successfully allocated descriptor sets.");
}
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
for (size_t i = 0; i < this->mTensors.size(); i++) {
std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
vk::DescriptorBufferInfo descriptorBufferInfo =
this->mTensors[i]->constructDescriptorBufferInfo();
computeWriteDescriptorSets.push_back(
vk::WriteDescriptorSet(*this->mDescriptorSet,
i, // Destination binding
0, // Destination array element
1, // Descriptor count
vk::DescriptorType::eStorageBuffer,
nullptr, // Descriptor image info
&descriptorBufferInfo));
this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
nullptr);
}
KP_LOG_DEBUG("Kompute Algorithm successfully run init");
}
void
Algorithm::createShaderModule()
{
KP_LOG_DEBUG("Kompute Algorithm createShaderModule started");
vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(),
sizeof(uint32_t) *
this->mSpirv.size(),
this->mSpirv.data());
KP_LOG_DEBUG("Kompute Algorithm Creating shader module. ShaderFileSize: {}",
this->mSpirv.size());
this->mFreeShaderModule = true;
this->mShaderModule = std::make_shared<vk::ShaderModule>();
this->mDevice->createShaderModule(
&shaderModuleInfo, nullptr, this->mShaderModule.get());
this->mFreeShaderModule = true;
KP_LOG_DEBUG("Kompute Algorithm create shader module success");
}
void
Algorithm::createPipeline()
{
    // Builds the compute pipeline: pipeline layout (descriptor set layout +
    // optional push-constant range), specialization constants, shader stage,
    // and finally the pipeline itself, using a process-wide pipeline cache.
    KP_LOG_DEBUG("Kompute Algorithm calling create Pipeline");

    vk::PipelineLayoutCreateInfo pipelineLayoutInfo(
      vk::PipelineLayoutCreateFlags(),
      1, // Set layout count
      this->mDescriptorSetLayout.get());

    // Push constants are optional; the range is only attached when a size
    // has been configured.
    vk::PushConstantRange pushConstantRange;
    if (this->mPushConstantsSize) {
        pushConstantRange.setStageFlags(vk::ShaderStageFlagBits::eCompute);
        pushConstantRange.setOffset(0);
        pushConstantRange.setSize(this->mPushConstantsDataTypeMemorySize *
                                  this->mPushConstantsSize);

        pipelineLayoutInfo.setPushConstantRangeCount(1);
        pipelineLayoutInfo.setPPushConstantRanges(&pushConstantRange);
    }

    this->mPipelineLayout = std::make_shared<vk::PipelineLayout>();
    // NOTE(review): the vk::Result of createPipelineLayout is ignored here —
    // consider checking it the way createParameters() checks its calls.
    this->mDevice->createPipelineLayout(
      &pipelineLayoutInfo, nullptr, this->mPipelineLayout.get());
    this->mFreePipelineLayout = true;

    // One specialization map entry per constant, laid out contiguously with
    // a fixed per-constant memory size.
    std::vector<vk::SpecializationMapEntry> specializationEntries;

    for (uint32_t i = 0; i < this->mSpecializationConstantsSize; i++) {
        vk::SpecializationMapEntry specializationEntry(
          static_cast<uint32_t>(i),
          static_cast<uint32_t>(
            this->mSpecializationConstantsDataTypeMemorySize * i),
          this->mSpecializationConstantsDataTypeMemorySize);

        specializationEntries.push_back(specializationEntry);
    }

    // This passes ownership of the memory so we remove ownership from
    // specialization container by using "transferDataOwnership"
    vk::SpecializationInfo specializationInfo(
      static_cast<uint32_t>(specializationEntries.size()),
      specializationEntries.data(),
      this->mSpecializationConstantsDataTypeMemorySize *
        this->mSpecializationConstantsSize,
      this->mSpecializationConstantsData);

    vk::PipelineShaderStageCreateInfo shaderStage(
      vk::PipelineShaderStageCreateFlags(),
      vk::ShaderStageFlagBits::eCompute,
      *this->mShaderModule,
      "main",
      &specializationInfo);

    // A single pipeline cache is shared by every Algorithm in the process
    // (function-local static) and created lazily on first use.
    // NOTE(review): mFreePipelineCache is set on the instance that created
    // the shared cache — confirm the destroy path handles the shared
    // ownership correctly.
    static std::shared_ptr<vk::PipelineCache> globalPipelineCache = std::make_shared<vk::PipelineCache>();
    if(!*globalPipelineCache) {
        vk::PipelineCacheCreateInfo pipelineCacheInfo =
          vk::PipelineCacheCreateInfo();
        this->mPipelineCache = globalPipelineCache;
        this->mFreePipelineCache = true;
        this->mDevice->createPipelineCache(
          &pipelineCacheInfo, nullptr, globalPipelineCache.get());
    }

    vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(),
                                               shaderStage,
                                               *this->mPipelineLayout,
                                               vk::Pipeline(),
                                               0);

#ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
    // Newer Vulkan-Hpp signature: returns a ResultValue that we check
    // explicitly and convert into an exception on failure.
    vk::ResultValue<vk::Pipeline> pipelineResult =
      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo);

    if (pipelineResult.result != vk::Result::eSuccess) {
        throw std::runtime_error("Failed to create pipeline result: " +
                                 vk::to_string(pipelineResult.result));
    }

    vk::Pipeline& pipeline = pipelineResult.value;
    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
    this->mFreePipeline = true;
#else
    // Older signature: extract .value without an explicit result check.
    vk::Pipeline pipeline =
      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo)
        .value;
    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
    this->mFreePipeline = true;
#endif

    // TODO: Update to consistent
    // this->mPipeline = std::make_shared<vk::Pipeline>();
    // this->mDevice->createComputePipelines(
    //   *this->mPipelineCache, 1, &pipelineInfo, nullptr,
    //   this->mPipeline.get());

    KP_LOG_DEBUG("Kompute Algorithm Create Pipeline Success");
}
void
Algorithm::recordBindCore(const vk::CommandBuffer& commandBuffer)
{
    // Record the pipeline bind followed by the descriptor-set bind so a
    // subsequent dispatch uses this Algorithm's pipeline and tensors.
    KP_LOG_DEBUG("Kompute Algorithm binding pipeline");
    commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute,
                               *this->mPipeline);

    KP_LOG_DEBUG("Kompute Algorithm binding descriptor sets");
    const uint32_t firstSet = 0;
    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
                                     *this->mPipelineLayout,
                                     firstSet,
                                     *this->mDescriptorSet,
                                     nullptr // Dispatcher
    );
}
void
Algorithm::recordBindPush(const vk::CommandBuffer& commandBuffer)
{
    // Push constants are optional; nothing to record when none were set.
    if (!this->mPushConstantsSize) {
        return;
    }

    const auto totalBytes =
      this->mPushConstantsSize * this->mPushConstantsDataTypeMemorySize;

    KP_LOG_DEBUG("Kompute Algorithm binding push constants memory size: {}",
                 totalBytes);

    commandBuffer.pushConstants(*this->mPipelineLayout,
                                vk::ShaderStageFlagBits::eCompute,
                                0,
                                totalBytes,
                                this->mPushConstantsData);
}
void
Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer)
{
    // Records a compute dispatch using the workgroup dimensions previously
    // configured via setWorkgroup().
    KP_LOG_DEBUG("Kompute Algorithm recording dispatch");
    commandBuffer.dispatch(
      this->mWorkgroup[0], this->mWorkgroup[1], this->mWorkgroup[2]);
}
void
Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize)
{
KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size");
// The dispatch size is set up based on either explicitly provided template
// parameters or by default it would take the shape and size of the tensors
if (workgroup[0] > 0) {
// If at least the x value is provided we use mainly the parameters
// provided
this->mWorkgroup = { workgroup[0],
workgroup[1] > 0 ? workgroup[1] : 1,
workgroup[2] > 0 ? workgroup[2] : 1 };
} else {
this->mWorkgroup = { minSize, 1, 1 };
}
KP_LOG_INFO("Kompute OpAlgoCreate set dispatch size X: {}, Y: {}, Z: {}",
this->mWorkgroup[0],
this->mWorkgroup[1],
this->mWorkgroup[2]);
}
// Returns the currently configured dispatch dimensions (x, y, z).
const Workgroup&
Algorithm::getWorkgroup()
{
    return this->mWorkgroup;
}

// Returns the tensors currently associated with this algorithm.
const std::vector<std::shared_ptr<Tensor>>&
Algorithm::getTensors()
{
    return this->mTensors;
}

// Replaces the tensor list. Note: this only swaps the container; descriptor
// bindings are presumably refreshed separately (see updateParameters) —
// confirm at the call sites.
void Algorithm::setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors)
{
    this->mTensors = tensors;
}
}

View File

@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
# Build script for the core kompute library target.
# Fix: the original called cmake_minimum_required(VERSION 3.20) twice; the
# duplicate call has been removed.
cmake_minimum_required(VERSION 3.20)

if(KOMPUTE_OPT_ANDROID_BUILD)
    find_library(android android)
endif()

add_library(kompute Algorithm.cpp
                    Manager.cpp
                    OpAlgoDispatch.cpp
                    OpMemoryBarrier.cpp
                    OpTensorCopy.cpp
                    OpTensorSyncDevice.cpp
                    OpTensorSyncLocal.cpp
                    OpBufferSyncDevice.cpp
                    OpBufferSyncLocal.cpp
                    Sequence.cpp
                    Tensor.cpp
                    Core.cpp)
add_library(kompute::kompute ALIAS kompute)

# Set version for shared libraries.
set_target_properties(kompute
    PROPERTIES
        VERSION ${${PROJECT_NAME}_VERSION}
        SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR})

# Import GNU common install directory variables
include(GNUInstallDirs)

install(TARGETS kompute
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})

# Include CMake helpers for package config files
# Follow this installation guideline: https://cmake.org/cmake/help/latest/manual/cmake-packages.7.html
include(CMakePackageConfigHelpers)

configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in
                              "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake"
                              INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)

# NOTE(review): komputeConfigVersion.cmake is installed below but no
# write_basic_package_version_file() call is visible in this file — confirm
# it is generated elsewhere.
install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake
              ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)

# ####################################################
# Linking
# ####################################################
if(KOMPUTE_OPT_ANDROID_BUILD)
    target_link_libraries(kompute PUBLIC vulkanAndroid
                                         android
                                         kp_logger
                                         kp_shader
                                         fmt::fmt)
else()
    target_link_libraries(kompute PUBLIC Vulkan::Vulkan
                                         kp_logger
                                         kp_shader
                                         fmt::fmt)
endif()

if(KOMPUTE_OPT_BUILD_PYTHON)
    include_directories(${PYTHON_INCLUDE_DIRS})
    target_link_libraries(kompute PRIVATE pybind11::headers ${PYTHON_LIBRARIES})
endif()

if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
    target_link_libraries(kompute PUBLIC Vulkan-Headers)
endif()

# ####################################################
# Misc
# ####################################################
add_subdirectory(logger)
add_subdirectory(shaders)
add_subdirectory(include)

27
kompute/src/Core.cpp Normal file
View File

@ -0,0 +1,27 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Core.hpp"
#if VK_USE_PLATFORM_ANDROID_KHR
#ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
#define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
/**
* Ensures support for dynamic loading of Vulkan functions on Android.
* Acts as a default store for loaded functions.
* More information:
* https://github.com/KhronosGroup/Vulkan-Hpp#vulkan_hpp_default_dispatcher
**/
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
#endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
#endif // VK_USE_PLATFORM_ANDROID_KHR
namespace kp {
} // namespace kp

493
kompute/src/Manager.cpp Normal file
View File

@ -0,0 +1,493 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Manager.hpp"
#include "fmt/format.h"
#include "kompute/logger/Logger.hpp"
#include <fmt/core.h>
#include <iterator>
#include <set>
#include <sstream>
#include <string>
namespace kp {
#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
// Callback registered with the VK_EXT_debug_report extension; forwards
// validation-layer messages to the Kompute debug log. Always returns
// VK_FALSE so the triggering Vulkan call is not aborted.
static VKAPI_ATTR VkBool32 VKAPI_CALL
debugMessageCallback(VkDebugReportFlagsEXT /*flags*/,
                     VkDebugReportObjectTypeEXT /*objectType*/,
                     uint64_t /*object*/,
                     size_t /*location*/,
                     int32_t /*messageCode*/,
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
                     const char* pLayerPrefix,
                     const char* pMessage,
#else
                     // Unnamed when debug logging is compiled out, to avoid
                     // unused-parameter warnings (KP_LOG_DEBUG becomes a
                     // no-op in that configuration).
                     const char* /*pLayerPrefix*/,
                     const char* /*pMessage*/,
#endif
                     void* /*pUserData*/)
{
    KP_LOG_DEBUG("[VALIDATION]: {} - {}", pLayerPrefix, pMessage);
    return VK_FALSE;
}
#endif
// Default constructor: enables resource management (the Manager will
// destroy sequences/algorithms/tensors it created) and creates the Vulkan
// instance immediately.
Manager::Manager()
{
    this->mManageResources = true;

// Make sure the logger is setup
#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
    logger::setupLogger();
#endif
    this->createInstance();
}

// Creates the logical device on the physical device at the given index,
// optionally with explicit queue family indices and device extensions.
void Manager::initializeDevice(uint32_t physicalDeviceIndex,
                               const std::vector<uint32_t>& familyQueueIndices,
                               const std::vector<std::string>& desiredExtensions)
{
    this->createDevice(
      familyQueueIndices, physicalDeviceIndex, desiredExtensions);
}

// Destructor delegates to destroy(), which tears down all owned resources.
Manager::~Manager()
{
    KP_LOG_DEBUG("Kompute Manager Destructor started");
    this->destroy();
}
void
Manager::destroy()
{
    // Tears down resources in dependency order: managed sequences,
    // algorithms and tensors first, then the device, the debug-report
    // callback, and finally the instance. Freed members are nulled, so a
    // second call exits early on the null device/instance checks.
    KP_LOG_DEBUG("Kompute Manager destroy() started");

    if (this->mDevice == nullptr) {
        KP_LOG_ERROR(
          "Kompute Manager destructor reached with null Device pointer");
        return;
    }

    if (this->mManageResources && this->mManagedSequences.size()) {
        KP_LOG_DEBUG("Kompute Manager explicitly running destructor for "
                     "managed sequences");
        // Only objects still alive elsewhere (lock() succeeds) are destroyed
        // explicitly; expired weak pointers are simply dropped.
        for (const std::weak_ptr<Sequence>& weakSq : this->mManagedSequences) {
            if (std::shared_ptr<Sequence> sq = weakSq.lock()) {
                sq->destroy();
            }
        }
        this->mManagedSequences.clear();
    }

    if (this->mManageResources && this->mManagedAlgorithms.size()) {
        KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms");
        for (const std::weak_ptr<Algorithm>& weakAlgorithm :
             this->mManagedAlgorithms) {
            if (std::shared_ptr<Algorithm> algorithm = weakAlgorithm.lock()) {
                algorithm->destroy();
            }
        }
        this->mManagedAlgorithms.clear();
    }

    if (this->mManageResources && this->mManagedTensors.size()) {
        KP_LOG_DEBUG("Kompute Manager explicitly freeing tensors");
        for (const std::weak_ptr<Tensor>& weakTensor : this->mManagedTensors) {
            if (std::shared_ptr<Tensor> tensor = weakTensor.lock()) {
                tensor->destroy();
            }
        }
        this->mManagedTensors.clear();
    }

    if (this->mFreeDevice) {
        KP_LOG_INFO("Destroying device");
        this->mDevice->destroy(
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mDevice = nullptr;
        KP_LOG_DEBUG("Kompute Manager Destroyed Device");
    }

    if (this->mInstance == nullptr) {
        KP_LOG_ERROR(
          "Kompute Manager destructor reached with null Instance pointer");
        return;
    }

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    // The debug callback must be destroyed before the instance that owns it.
    if (this->mDebugReportCallback) {
        this->mInstance->destroyDebugReportCallbackEXT(
          this->mDebugReportCallback, nullptr, this->mDebugDispatcher);
        KP_LOG_DEBUG("Kompute Manager Destroyed Debug Report Callback");
    }
#endif

    if (this->mFreeInstance) {
        this->mInstance->destroy(
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mInstance = nullptr;
        KP_LOG_DEBUG("Kompute Manager Destroyed Instance");
    }
}
void
Manager::createInstance()
{
    // Creates the Vulkan instance, optionally enabling debug-report
    // validation layers (unless KOMPUTE_DISABLE_VK_DEBUG_LAYERS is defined)
    // and initializing the dynamic dispatcher required on Android.
    KP_LOG_DEBUG("Kompute Manager creating instance");

    this->mFreeInstance = true;

    vk::ApplicationInfo applicationInfo;
    applicationInfo.pApplicationName = "Kompute";
    applicationInfo.pEngineName = "Kompute";
    applicationInfo.apiVersion = KOMPUTE_VK_API_VERSION;
    applicationInfo.engineVersion = KOMPUTE_VK_API_VERSION;
    applicationInfo.applicationVersion = KOMPUTE_VK_API_VERSION;

    std::vector<const char*> applicationExtensions;

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME);
#endif

    vk::InstanceCreateInfo computeInstanceCreateInfo;
    computeInstanceCreateInfo.pApplicationInfo = &applicationInfo;
    if (!applicationExtensions.empty()) {
        computeInstanceCreateInfo.enabledExtensionCount =
          (uint32_t)applicationExtensions.size();
        computeInstanceCreateInfo.ppEnabledExtensionNames =
          applicationExtensions.data();
    }

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    KP_LOG_DEBUG("Kompute Manager adding debug validation layers");
    // We'll identify the layers that are supported
    std::vector<const char*> validLayerNames;
    std::vector<const char*> desiredLayerNames = {
        "VK_LAYER_LUNARG_assistant_layer",
        "VK_LAYER_LUNARG_standard_validation",
        "VK_LAYER_KHRONOS_validation",
    };

    // Additional layers can be requested at runtime via the
    // KOMPUTE_ENV_DEBUG_LAYERS environment variable (whitespace-separated).
    std::vector<std::string> envLayerNames;
    const char* envLayerNamesVal = std::getenv("KOMPUTE_ENV_DEBUG_LAYERS");
    if (envLayerNamesVal != nullptr && *envLayerNamesVal != '\0') {
        KP_LOG_DEBUG("Kompute Manager adding environment layers: {}",
                     envLayerNamesVal);
        std::istringstream iss(envLayerNamesVal);
        std::istream_iterator<std::string> beg(iss);
        std::istream_iterator<std::string> end;
        envLayerNames = std::vector<std::string>(beg, end);
        // The stored c_str() pointers stay valid because envLayerNames is
        // declared in the enclosing scope and outlives their use below.
        for (const std::string& layerName : envLayerNames) {
            desiredLayerNames.push_back(layerName.c_str());
        }
        KP_LOG_DEBUG("Desired layers: {}", fmt::join(desiredLayerNames, ", "));
    }

    // Identify the valid layer names based on the desiredLayerNames
    {
        std::set<std::string> uniqueLayerNames;
        std::vector<vk::LayerProperties> availableLayerProperties =
          vk::enumerateInstanceLayerProperties();
        for (vk::LayerProperties layerProperties : availableLayerProperties) {
            std::string layerName(layerProperties.layerName.data());
            uniqueLayerNames.insert(layerName);
        }
        KP_LOG_DEBUG("Available layers: {}", fmt::join(uniqueLayerNames, ", "));
        for (const char* desiredLayerName : desiredLayerNames) {
            if (uniqueLayerNames.count(desiredLayerName) != 0) {
                validLayerNames.push_back(desiredLayerName);
            }
        }
    }

    if (!validLayerNames.empty()) {
        KP_LOG_DEBUG(
          "Kompute Manager Initializing instance with valid layers: {}",
          fmt::join(validLayerNames, ", "));
        computeInstanceCreateInfo.enabledLayerCount =
          static_cast<uint32_t>(validLayerNames.size());
        computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data();
    } else {
        KP_LOG_WARN("Kompute Manager no valid layer names found from desired "
                    "layer names");
    }
#endif

#if VK_USE_PLATFORM_ANDROID_KHR
    // Android loads Vulkan entry points dynamically at runtime.
    vk::DynamicLoader dl;
    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
      dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
#endif // VK_USE_PLATFORM_ANDROID_KHR

    this->mInstance = std::make_shared<vk::Instance>();
    // NOTE(review): the vk::Result of vk::createInstance is ignored; a
    // failed creation leaves *mInstance null — consider checking it.
    vk::createInstance(
      &computeInstanceCreateInfo, nullptr, this->mInstance.get());

#if VK_USE_PLATFORM_ANDROID_KHR
    VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance);
#endif // VK_USE_PLATFORM_ANDROID_KHR

    KP_LOG_DEBUG("Kompute Manager Instance Created");

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    KP_LOG_DEBUG("Kompute Manager adding debug callbacks");
    if (validLayerNames.size() > 0) {
        vk::DebugReportFlagsEXT debugFlags =
          vk::DebugReportFlagBitsEXT::eError |
          vk::DebugReportFlagBitsEXT::eWarning;
        vk::DebugReportCallbackCreateInfoEXT debugCreateInfo = {};
        debugCreateInfo.pfnCallback =
          (PFN_vkDebugReportCallbackEXT)debugMessageCallback;
        debugCreateInfo.flags = debugFlags;

        this->mDebugDispatcher.init(*this->mInstance, &vkGetInstanceProcAddr);
        this->mDebugReportCallback =
          this->mInstance->createDebugReportCallbackEXT(
            debugCreateInfo, nullptr, this->mDebugDispatcher);
    }
#endif
}
void
Manager::clear()
{
    // Prunes expired weak references from the managed resource lists. Live
    // resources are left untouched; this only removes bookkeeping entries
    // whose objects were already released elsewhere.
    if (this->mManageResources) {
        this->mManagedTensors.erase(
          std::remove_if(begin(this->mManagedTensors),
                         end(this->mManagedTensors),
                         [](std::weak_ptr<Tensor> t) { return t.expired(); }),
          end(this->mManagedTensors));
        this->mManagedAlgorithms.erase(
          std::remove_if(
            begin(this->mManagedAlgorithms),
            end(this->mManagedAlgorithms),
            [](std::weak_ptr<Algorithm> t) { return t.expired(); }),
          end(this->mManagedAlgorithms));
        this->mManagedSequences.erase(
          std::remove_if(begin(this->mManagedSequences),
                         end(this->mManagedSequences),
                         [](std::weak_ptr<Sequence> t) { return t.expired(); }),
          end(this->mManagedSequences));
    }
}
void
Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
                      uint32_t physicalDeviceIndex,
                      const std::vector<std::string>& desiredExtensions)
{
    // Selects a physical device, resolves compute queue families, filters
    // the desired extensions against device support, enables required
    // shader/storage features, creates the logical device, and retrieves
    // the queue handles.
    KP_LOG_DEBUG("Kompute Manager creating Device");

    if (this->mInstance == nullptr) {
        throw std::runtime_error("Kompute Manager instance is null");
    }

    this->mFreeDevice = true;

    // Getting an integer that says how many vulkan devices we have
    std::vector<vk::PhysicalDevice> physicalDevices =
      this->mInstance->enumeratePhysicalDevices();
    uint32_t deviceCount = physicalDevices.size();

    // This means there are no devices at all
    if (deviceCount == 0) {
        throw std::runtime_error("Failed to find GPUs with Vulkan support! "
                                 "Maybe you haven't installed vulkan drivers?");
    }

    // This means that we're exceeding our device limit, for
    // example if we have 2 devices, just physicalDeviceIndex
    // 0 and 1 are acceptable. Hence, physicalDeviceIndex should
    // always be less than deviceCount, else we raise an error
    if (!(deviceCount > physicalDeviceIndex)) {
        throw std::runtime_error("There is no such physical index or device, "
                                 "please use your existing device");
    }

    vk::PhysicalDevice physicalDevice = physicalDevices[physicalDeviceIndex];

    this->mPhysicalDevice =
      std::make_shared<vk::PhysicalDevice>(physicalDevice);

    // Only fetched when INFO logging is compiled in; the KP_LOG_INFO below
    // is compiled out in the same configuration, so the variable is unused
    // otherwise.
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
    vk::PhysicalDeviceProperties physicalDeviceProperties =
      physicalDevice.getProperties();
#endif

    KP_LOG_INFO("Using physical device index {} found {}",
                physicalDeviceIndex,
                physicalDeviceProperties.deviceName);

    if (familyQueueIndices.empty()) {
        // Find compute queue
        std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
          physicalDevice.getQueueFamilyProperties();

        uint32_t computeQueueFamilyIndex = 0;
        bool computeQueueSupported = false;
        for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
            vk::QueueFamilyProperties queueFamilyProperties =
              allQueueFamilyProperties[i];

            if (queueFamilyProperties.queueFlags &
                vk::QueueFlagBits::eCompute) {
                computeQueueFamilyIndex = i;
                computeQueueSupported = true;
                break;
            }
        }

        if (!computeQueueSupported) {
            throw std::runtime_error("Compute queue is not supported");
        }

        this->mComputeQueueFamilyIndices.push_back(computeQueueFamilyIndex);
    } else {
        this->mComputeQueueFamilyIndices = familyQueueIndices;
    }

    // Count how many queues are requested per family, each with equal
    // priority 1.0.
    std::unordered_map<uint32_t, uint32_t> familyQueueCounts;
    std::unordered_map<uint32_t, std::vector<float>> familyQueuePriorities;
    for (const auto& value : this->mComputeQueueFamilyIndices) {
        familyQueueCounts[value]++;
        familyQueuePriorities[value].push_back(1.0f);
    }

    std::unordered_map<uint32_t, uint32_t> familyQueueIndexCount;
    std::vector<vk::DeviceQueueCreateInfo> deviceQueueCreateInfos;
    for (const auto& familyQueueInfo : familyQueueCounts) {
        // Setting the device count to 0
        familyQueueIndexCount[familyQueueInfo.first] = 0;

        // Creating the respective device queue
        vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
          vk::DeviceQueueCreateFlags(),
          familyQueueInfo.first,
          familyQueueInfo.second,
          familyQueuePriorities[familyQueueInfo.first].data());
        deviceQueueCreateInfos.push_back(deviceQueueCreateInfo);
    }

    KP_LOG_DEBUG("Kompute Manager desired extension layers {}",
                 fmt::join(desiredExtensions, ", "));

    // Filter the desired extensions down to those the device advertises;
    // unsupported ones are logged as errors but are not fatal.
    std::vector<vk::ExtensionProperties> deviceExtensions =
      this->mPhysicalDevice->enumerateDeviceExtensionProperties();

    std::set<std::string> uniqueExtensionNames;
    for (const vk::ExtensionProperties& ext : deviceExtensions) {
        uniqueExtensionNames.insert(ext.extensionName);
    }
    KP_LOG_DEBUG("Kompute Manager available extensions {}",
                 fmt::join(uniqueExtensionNames, ", "));
    std::vector<const char*> validExtensions;
    for (const std::string& ext : desiredExtensions) {
        if (uniqueExtensionNames.count(ext) != 0) {
            validExtensions.push_back(ext.c_str());
        }
    }
    if (desiredExtensions.size() != validExtensions.size()) {
        KP_LOG_ERROR("Kompute Manager not all extensions were added: {}",
                     fmt::join(validExtensions, ", "));
    }

    // Enable 8/16-bit storage access and small-type shader features.
    // NOTE(review): these are enabled unconditionally without querying
    // getFeatures() for support — device creation will fail on hardware
    // lacking them; confirm this is intended.
    vk::PhysicalDeviceFeatures features;
    features.shaderInt16 = true;

    vk::PhysicalDeviceVulkan11Features features11;
    features11.uniformAndStorageBuffer16BitAccess = true;
    features11.storageBuffer16BitAccess = true;
    features11.pNext = nullptr;

    vk::PhysicalDeviceVulkan12Features features12;
    features12.storageBuffer8BitAccess = true;
    features12.uniformAndStorageBuffer8BitAccess = true;
    features12.shaderFloat16 = true;
    features12.shaderInt8 = true;
    features12.pNext = &features11;

    vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(),
                                          deviceQueueCreateInfos.size(),
                                          deviceQueueCreateInfos.data(),
                                          {},
                                          {},
                                          validExtensions.size(),
                                          validExtensions.data(),
                                          &features);
    deviceCreateInfo.pNext = &features12;

    this->mDevice = std::make_shared<vk::Device>();
    vk::Result r = physicalDevice.createDevice(
      &deviceCreateInfo, nullptr, this->mDevice.get());
    if (r != vk::Result::eSuccess) {
        KP_LOG_ERROR("Kompute Manager could not create device");
    }

    KP_LOG_DEBUG("Kompute Manager device created");

    // Retrieve one vk::Queue handle per requested family index, cycling
    // through the per-family queue counter.
    for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndices) {
        std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();

        this->mDevice->getQueue(familyQueueIndex,
                                familyQueueIndexCount[familyQueueIndex],
                                currQueue.get());

        familyQueueIndexCount[familyQueueIndex]++;

        this->mComputeQueues.push_back(currQueue);
    }

    KP_LOG_DEBUG("Kompute Manager compute queue obtained");
}
std::shared_ptr<Sequence>
Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps)
{
    // Creates a new Sequence bound to the queue at queueIndex; if resource
    // management is enabled the Manager tracks it for explicit destruction.
    KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex);

    // Fix: the original indexed mComputeQueues / mComputeQueueFamilyIndices
    // unchecked, which is undefined behaviour for an out-of-range index.
    if (queueIndex >= this->mComputeQueues.size()) {
        throw std::runtime_error("Kompute Manager invalid queue index");
    }

    std::shared_ptr<Sequence> sq{ new kp::Sequence(
      this->mPhysicalDevice,
      this->mDevice,
      this->mComputeQueues[queueIndex],
      this->mComputeQueueFamilyIndices[queueIndex],
      totalTimestamps) };

    if (this->mManageResources) {
        this->mManagedSequences.push_back(sq);
    }

    return sq;
}
// Returns the properties (name, limits, etc.) of the selected physical
// device.
vk::PhysicalDeviceProperties
Manager::getDeviceProperties() const
{
    return this->mPhysicalDevice->getProperties();
}

// Enumerates all Vulkan physical devices visible to this instance.
std::vector<vk::PhysicalDevice>
Manager::listDevices() const
{
    return this->mInstance->enumeratePhysicalDevices();
}

// Exposes the underlying vk::Instance for interop with external code.
std::shared_ptr<vk::Instance>
Manager::getVkInstance() const
{
    return this->mInstance;
}
}

View File

@ -0,0 +1,65 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpAlgoDispatch.hpp"
namespace kp {
OpAlgoDispatch::~OpAlgoDispatch()
{
    KP_LOG_DEBUG("Kompute OpAlgoDispatch destructor started");

    // Releases the owned push-constants blob with free() — presumably it is
    // allocated with malloc/realloc at the point mPushConstantsData is set;
    // confirm at the allocation site.
    if (this->mPushConstantsData) {
        KP_LOG_DEBUG("Kompute freeing push constants data");
        free(this->mPushConstantsData);
    }
}

void
OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer)
{
    // Records barriers for all of the algorithm's tensors, forwards any push
    // constants, then records the bind + dispatch commands.
    KP_LOG_DEBUG("Kompute OpAlgoDispatch record called");

    // Barrier to ensure the data is finished writing to buffer memory
    for (const std::shared_ptr<Tensor>& tensor :
         this->mAlgorithm->getTensors()) {
        tensor->recordPrimaryBufferMemoryBarrier(
          commandBuffer,
          vk::AccessFlagBits::eTransferWrite,
          vk::AccessFlagBits::eShaderRead,
          vk::PipelineStageFlagBits::eTransfer,
          vk::PipelineStageFlagBits::eComputeShader);
    }

    // Hand the push constants to the algorithm before binding so they are
    // included in this recording.
    if (this->mPushConstantsSize) {
        this->mAlgorithm->setPushConstants(
          this->mPushConstantsData,
          this->mPushConstantsSize,
          this->mPushConstantsDataTypeMemorySize);
    }

    this->mAlgorithm->recordBindCore(commandBuffer);
    this->mAlgorithm->recordBindPush(commandBuffer);
    this->mAlgorithm->recordDispatch(commandBuffer);
}

// No pre-submit work is needed for a dispatch; logging only.
void
OpAlgoDispatch::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpAlgoDispatch preEval called");
}

// No post-submit work is needed for a dispatch; logging only.
void
OpAlgoDispatch::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpAlgoDispatch postSubmit called");
}
}

View File

@ -0,0 +1,51 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpBufferSyncDevice.hpp"
namespace kp {
// Operation that copies `size` bytes from the staging buffer into the
// primary buffer when recorded. The op borrows both buffer pointers; it
// does not take ownership of them.
OpBufferSyncDevice::OpBufferSyncDevice(
  vk::Buffer *primaryBuffer,
  vk::Buffer *stagingBuffer,
  vk::DeviceSize size)
  : mPrimaryBuffer(primaryBuffer)
  , mStagingBuffer(stagingBuffer)
  , mSize(size)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice constructor with params");
}

OpBufferSyncDevice::~OpBufferSyncDevice()
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice destructor started");
}

void
OpBufferSyncDevice::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice record called");
    // Copy mSize bytes from offset 0 of staging to offset 0 of primary.
    vk::BufferCopy copyRegion(0, 0, mSize);
    commandBuffer.copyBuffer(*mStagingBuffer, *mPrimaryBuffer, copyRegion);
}

// No pre-submit work needed; logging only.
void
OpBufferSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice preEval called");
}

// No post-submit work needed; logging only.
void
OpBufferSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice postEval called");
}
}

View File

@ -0,0 +1,51 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpBufferSyncLocal.hpp"
namespace kp {
// Operation that copies `size` bytes from the primary buffer back into the
// staging buffer when recorded (the reverse of OpBufferSyncDevice). The op
// borrows both buffer pointers; it does not take ownership of them.
OpBufferSyncLocal::OpBufferSyncLocal(
  vk::Buffer *primaryBuffer,
  vk::Buffer *stagingBuffer,
  vk::DeviceSize size)
  : mPrimaryBuffer(primaryBuffer)
  , mStagingBuffer(stagingBuffer)
  , mSize(size)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal constructor with params");
}

OpBufferSyncLocal::~OpBufferSyncLocal()
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal destructor started");
}

void
OpBufferSyncLocal::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal record called");
    // Copy mSize bytes from offset 0 of primary to offset 0 of staging.
    vk::BufferCopy copyRegion(0, 0, mSize);
    commandBuffer.copyBuffer(*mPrimaryBuffer, *mStagingBuffer, copyRegion);
}

// No pre-submit work needed; logging only.
void
OpBufferSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal preEval called");
}

// No post-submit work needed; logging only.
void
OpBufferSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal postEval called");
}
}

View File

@ -0,0 +1,74 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpMemoryBarrier.hpp"
namespace kp {

/**
 * Operation that records one buffer memory barrier per tensor, either on the
 * primary (device) buffer or on the staging buffer depending on
 * `barrierOnPrimary`.
 */
OpMemoryBarrier::OpMemoryBarrier(
  const std::vector<std::shared_ptr<Tensor>>& tensors,
  const vk::AccessFlagBits& srcAccessMask,
  const vk::AccessFlagBits& dstAccessMask,
  const vk::PipelineStageFlagBits& srcStageMask,
  const vk::PipelineStageFlagBits& dstStageMask,
  bool barrierOnPrimary)
  : mSrcAccessMask(srcAccessMask)
  , mDstAccessMask(dstAccessMask)
  , mSrcStageMask(srcStageMask)
  , mDstStageMask(dstStageMask)
  , mBarrierOnPrimary(barrierOnPrimary)
  , mTensors(tensors)
{
    KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor");
}

OpMemoryBarrier::~OpMemoryBarrier()
{
    KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started");
}

// Records the configured barrier on every tensor's chosen buffer.
void
OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpMemoryBarrier record called");

    // Barrier to ensure the data is finished writing to buffer memory
    if (this->mBarrierOnPrimary) {
        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
            tensor->recordPrimaryBufferMemoryBarrier(commandBuffer,
                                                     this->mSrcAccessMask,
                                                     this->mDstAccessMask,
                                                     this->mSrcStageMask,
                                                     this->mDstStageMask);
        }
    } else {
        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
            tensor->recordStagingBufferMemoryBarrier(commandBuffer,
                                                     this->mSrcAccessMask,
                                                     this->mDstAccessMask,
                                                     this->mSrcStageMask,
                                                     this->mDstStageMask);
        }
    }
}

void
OpMemoryBarrier::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called");
}

void
OpMemoryBarrier::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    // Fixed log text: this hook previously logged "postSubmit called",
    // inconsistent with every sibling operation's postEval message.
    KP_LOG_DEBUG("Kompute OpMemoryBarrier postEval called");
}

}

View File

@ -0,0 +1,90 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpTensorCopy.hpp"
#include "kompute/Tensor.hpp"
namespace kp {

/**
 * Operation that copies the first tensor's contents into every other tensor.
 * Validates up-front that all tensors share the same data type and element
 * count, since the GPU copy is a raw byte copy.
 */
OpTensorCopy::OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors)
{
    KP_LOG_DEBUG("Kompute OpTensorCopy constructor with params");

    this->mTensors = tensors;

    // Need at least one source and one destination.
    if (this->mTensors.size() < 2) {
        // Fixed grammar in the error message ("tensor" -> "tensors").
        throw std::runtime_error(
          "Kompute OpTensorCopy called with less than 2 tensors");
    }

    kp::Tensor::TensorDataTypes dataType = this->mTensors[0]->dataType();
    uint32_t size = this->mTensors[0]->size();
    for (const std::shared_ptr<Tensor>& tensor : tensors) {
        if (tensor->dataType() != dataType) {
            throw std::runtime_error(fmt::format(
              "Attempting to copy tensors of different types from {} to {}",
              Tensor::toString(dataType),
              Tensor::toString(tensor->dataType())));
        }
        if (tensor->size() != size) {
            throw std::runtime_error(fmt::format(
              "Attempting to copy tensors of different sizes from {} to {}",
              size,
              tensor->size()));
        }
    }
}

OpTensorCopy::~OpTensorCopy()
{
    KP_LOG_DEBUG("Kompute OpTensorCopy destructor started");
}

// Records a device-side copy from tensor 0 into tensors 1..N-1.
void
OpTensorCopy::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpTensorCopy record called");

    // We iterate from the second tensor onwards and record a copy to all
    for (size_t i = 1; i < this->mTensors.size(); i++) {
        this->mTensors[i]->recordCopyFrom(commandBuffer, this->mTensors[0]);
    }
}

void
OpTensorCopy::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorCopy preEval called");
}

// Mirrors the device copy on the host-side mapped data, skipping eStorage
// tensors which have no host-visible memory.
void
OpTensorCopy::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorCopy postEval called");

    // Do not copy on CPU side if source is storage tensor
    if (this->mTensors[0]->tensorType() == kp::Tensor::TensorTypes::eStorage)
    {
        KP_LOG_DEBUG("Kompute OpTensorCopy not copying tensor source given it's of eStorage type");
        return;
    }
    void* data = this->mTensors[0]->rawData();

    // Copy the data from the first tensor into all the tensors
    for (size_t i = 1; i < this->mTensors.size(); i++) {
        if (this->mTensors[i]->tensorType() == kp::Tensor::TensorTypes::eStorage) {
            KP_LOG_DEBUG("Kompute OpTensorCopy not copying to tensor dest given it's of eStorage type");
            continue;
        }
        this->mTensors[i]->setRawData(data);
    }
}

}

View File

@ -0,0 +1,61 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpTensorSyncDevice.hpp"
namespace kp {

// Operation that uploads each eDevice tensor's staging buffer into its
// device-local primary buffer.
OpTensorSyncDevice::OpTensorSyncDevice(
  const std::vector<std::shared_ptr<Tensor>>& tensors)
  : mPrimaryBuffer(nullptr)
  , mStagingBuffer(nullptr)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice constructor with params");

    if (tensors.size() < 1) {
        throw std::runtime_error(
          "Kompute OpTensorSyncDevice called with less than 1 tensor");
    }

    this->mTensors = tensors;
}

OpTensorSyncDevice::~OpTensorSyncDevice()
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice destructor started");
    this->mTensors.clear();
}

// Records a staging -> device copy for every device-resident tensor; other
// tensor types have no staging buffer and are skipped.
void
OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice record called");

    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
            tensor->recordCopyFromStagingToDevice(commandBuffer);
        }
    }
}

void
OpTensorSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice preEval called");
}

void
OpTensorSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice postEval called");
}

}

View File

@ -0,0 +1,76 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpTensorSyncLocal.hpp"
namespace kp {

// Operation that downloads each eDevice tensor's primary buffer into its
// staging buffer so the host can read it after submission.
OpTensorSyncLocal::OpTensorSyncLocal(
  const std::vector<std::shared_ptr<Tensor>>& tensors)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal constructor with params");

    if (tensors.size() < 1) {
        throw std::runtime_error(
          "Kompute OpTensorSyncLocal called with less than 1 tensor");
    }

    this->mTensors = tensors;
}

OpTensorSyncLocal::~OpTensorSyncLocal()
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal destructor started");
}

// For each device tensor: barrier (shader write -> transfer read), copy
// device -> staging, then barrier (transfer write -> host read). The
// ordering of the three records per tensor is preserved exactly.
void
OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal record called");

    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
        if (tensor->tensorType() != Tensor::TensorTypes::eDevice) {
            continue;
        }
        tensor->recordPrimaryBufferMemoryBarrier(
          commandBuffer,
          vk::AccessFlagBits::eShaderWrite,
          vk::AccessFlagBits::eTransferRead,
          vk::PipelineStageFlagBits::eComputeShader,
          vk::PipelineStageFlagBits::eTransfer);

        tensor->recordCopyFromDeviceToStaging(commandBuffer);

        tensor->recordPrimaryBufferMemoryBarrier(
          commandBuffer,
          vk::AccessFlagBits::eTransferWrite,
          vk::AccessFlagBits::eHostRead,
          vk::PipelineStageFlagBits::eTransfer,
          vk::PipelineStageFlagBits::eHost);
    }
}

void
OpTensorSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal preEval called");
}

void
OpTensorSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal postEval called");

    KP_LOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
}

}

396
kompute/src/Sequence.cpp Normal file
View File

@ -0,0 +1,396 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Sequence.hpp"
namespace kp {
// Builds a sequence on top of an externally owned device/queue: creates its
// own command pool and primary command buffer, and optionally a timestamp
// query pool (one extra slot is reserved for the baseline timestamp written
// in begin()).
Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                   std::shared_ptr<vk::Device> device,
                   std::shared_ptr<vk::Queue> computeQueue,
                   uint32_t queueIndex,
                   uint32_t totalTimestamps)
{
    KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue");

    this->mPhysicalDevice = physicalDevice;
    this->mDevice = device;
    this->mComputeQueue = computeQueue;
    this->mQueueIndex = queueIndex;

    this->createCommandPool();
    this->createCommandBuffer();
    if (totalTimestamps > 0)
        this->createTimestampQueryPool(totalTimestamps +
                                       1); //+1 for the first one
}

// Releases Vulkan resources via destroy() if a device is still attached.
Sequence::~Sequence()
{
    KP_LOG_DEBUG("Kompute Sequence Destructor started");

    if (this->mDevice) {
        this->destroy();
    }
}
// Starts command-buffer recording. No-op if already recording; throws if a
// previous submission has not been awaited yet.
void
Sequence::begin()
{
    KP_LOG_DEBUG("Kompute sequence called BEGIN");

    if (this->isRecording()) {
        KP_LOG_DEBUG("Kompute Sequence begin called when already recording");
        return;
    }

    if (this->isRunning()) {
        throw std::runtime_error(
          "Kompute Sequence begin called when sequence still running");
    }

    KP_LOG_INFO("Kompute Sequence command now started recording");
    this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
    this->mRecording = true;

    // latch the first timestamp before any commands are submitted
    // (query index 0 is the baseline slot reserved by the constructor)
    if (this->timestampQueryPool)
        this->mCommandBuffer->writeTimestamp(
          vk::PipelineStageFlagBits::eAllCommands,
          *this->timestampQueryPool,
          0);
}
/**
 * Ends command-buffer recording. Throws if a submission is still in flight;
 * warns and returns if recording was never started.
 */
void
Sequence::end()
{
    KP_LOG_DEBUG("Kompute Sequence calling END");

    if (this->isRunning()) {
        // Fixed copy-pasted message: it previously said "begin called".
        throw std::runtime_error(
          "Kompute Sequence end called when sequence still running");
    }

    if (!this->isRecording()) {
        KP_LOG_WARN("Kompute Sequence end called when not recording");
        return;
    } else {
        KP_LOG_INFO("Kompute Sequence command recording END");
        this->mCommandBuffer->end();
        this->mRecording = false;
    }
}
// Closes any in-progress recording; recorded operations themselves are kept
// (they are cleared by destroy() or replaced on the next record path).
void
Sequence::clear()
{
    KP_LOG_DEBUG("Kompute Sequence calling clear");
    if (this->isRecording()) {
        this->end();
    }
}

// Synchronous evaluation: submit and block until the fence signals.
std::shared_ptr<Sequence>
Sequence::eval()
{
    KP_LOG_DEBUG("Kompute sequence EVAL BEGIN");

    return this->evalAsync()->evalAwait();
}

// Convenience overload: clears current recording, records the single op,
// then evaluates synchronously.
std::shared_ptr<Sequence>
Sequence::eval(std::shared_ptr<OpBase> op)
{
    this->clear();
    return this->record(op)->eval();
}
// Submits the recorded command buffer to the compute queue without waiting.
// Runs each operation's preEval hook first, then creates a fresh fence that
// evalAwait() will wait on and destroy.
std::shared_ptr<Sequence>
Sequence::evalAsync()
{
    if (this->isRecording()) {
        this->end();
    }

    if (this->mIsRunning) {
        throw std::runtime_error(
          "Kompute Sequence evalAsync called when an eval async was "
          "called without successful wait");
    }

    this->mIsRunning = true;

    for (size_t i = 0; i < this->mOperations.size(); i++) {
        this->mOperations[i]->preEval(*this->mCommandBuffer);
    }

    vk::SubmitInfo submitInfo(
      0, nullptr, nullptr, 1, this->mCommandBuffer.get());

    this->mFence = this->mDevice->createFence(vk::FenceCreateInfo());

    KP_LOG_DEBUG(
      "Kompute sequence submitting command buffer into compute queue");

    // NOTE(review): the vk::Result returned by submit() is discarded here —
    // confirm submission failures are surfaced somewhere (e.g. via the
    // vulkan-hpp exception configuration).
    this->mComputeQueue->submit(1, &submitInfo, this->mFence);

    return shared_from_this();
}
// Convenience overload mirroring eval(op): reset the current recording,
// record the single operation, then submit asynchronously.
std::shared_ptr<Sequence>
Sequence::evalAsync(std::shared_ptr<OpBase> op)
{
    this->clear();
    return this->record(op)->evalAsync();
}
// Blocks until the fence from the last evalAsync() signals (up to waitFor
// nanoseconds), destroys the fence, then runs each operation's postEval.
std::shared_ptr<Sequence>
Sequence::evalAwait(uint64_t waitFor)
{
    if (!this->mIsRunning) {
        KP_LOG_WARN("Kompute Sequence evalAwait called without existing eval");
        return shared_from_this();
    }

    vk::Result result =
      this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
    // NOTE(review): the fence is destroyed before the timeout check — on
    // eTimeout the fence may still be in use by the pending submission;
    // confirm this cannot trigger validation errors / UB.
    this->mDevice->destroy(
      this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);

    this->mIsRunning = false;

    if (result == vk::Result::eTimeout) {
        // NOTE(review): on timeout the postEval hooks below are skipped —
        // confirm callers expect that.
        KP_LOG_WARN("Kompute Sequence evalAwait reached timeout of {}",
                    waitFor);
        return shared_from_this();
    }

    for (size_t i = 0; i < this->mOperations.size(); i++) {
        this->mOperations[i]->postEval(*this->mCommandBuffer);
    }

    return shared_from_this();
}
// True while a submission from evalAsync() has not been awaited yet.
bool
Sequence::isRunning() const
{
    return this->mIsRunning;
}

// True between begin() and end() while the command buffer is open.
bool
Sequence::isRecording() const
{
    return this->mRecording;
}

// True once device, command pool/buffer and queue are all present.
bool
Sequence::isInit() const
{
    return this->mDevice && this->mCommandPool && this->mCommandBuffer &&
           this->mComputeQueue;
}
void
Sequence::rerecord()
{
this->end();
std::vector<std::shared_ptr<OpBase>> ops = this->mOperations;
this->mOperations.clear();
for (const std::shared_ptr<kp::OpBase>& op : ops) {
this->record(op);
}
}
// Tears down owned Vulkan resources in order: command buffer, command pool,
// recorded operations, query pool, then releases the shared device/queue
// handles.
void
Sequence::destroy()
{
    KP_LOG_DEBUG("Kompute Sequence destroy called");

    if (!this->mDevice) {
        KP_LOG_WARN("Kompute Sequence destroy called "
                    "with null Device pointer");
        return;
    }

    if (this->mFreeCommandBuffer) {
        KP_LOG_INFO("Freeing CommandBuffer");
        if (!this->mCommandBuffer) {
            KP_LOG_WARN("Kompute Sequence destroy called with null "
                        "CommandPool pointer");
            // NOTE(review): returning here skips destruction of the command
            // pool, operations, query pool and the handle releases below —
            // confirm this early-out cannot leak those resources.
            return;
        }

        this->mDevice->freeCommandBuffers(
          *this->mCommandPool, 1, this->mCommandBuffer.get());

        this->mCommandBuffer = nullptr;
        this->mFreeCommandBuffer = false;

        KP_LOG_DEBUG("Kompute Sequence Freed CommandBuffer");
    }

    if (this->mFreeCommandPool) {
        KP_LOG_INFO("Destroying CommandPool");
        if (this->mCommandPool == nullptr) {
            KP_LOG_WARN("Kompute Sequence destroy called with null "
                        "CommandPool pointer");
            // NOTE(review): same early-out concern as above.
            return;
        }

        this->mDevice->destroy(
          *this->mCommandPool,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);

        this->mCommandPool = nullptr;
        this->mFreeCommandPool = false;

        KP_LOG_DEBUG("Kompute Sequence Destroyed CommandPool");
    }

    if (this->mOperations.size()) {
        KP_LOG_INFO("Kompute Sequence clearing operations buffer");
        this->mOperations.clear();
    }

    if (this->timestampQueryPool) {
        KP_LOG_INFO("Destroying QueryPool");
        this->mDevice->destroy(
          *this->timestampQueryPool,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);

        this->timestampQueryPool = nullptr;
        KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool");
    }

    // Release (not destroy) the externally owned handles.
    if (this->mDevice) {
        this->mDevice = nullptr;
    }
    if (this->mPhysicalDevice) {
        this->mPhysicalDevice = nullptr;
    }
    if (this->mComputeQueue) {
        this->mComputeQueue = nullptr;
    }
}
// Appends an operation: opens recording if needed, records the op's commands,
// stores it for the pre/postEval hooks, and (if timestamps are enabled)
// writes a timestamp after the op. The query index equals the op count after
// push_back, so op i's timestamp lands at query i+1 (index 0 is the baseline
// written in begin()).
std::shared_ptr<Sequence>
Sequence::record(std::shared_ptr<OpBase> op)
{
    KP_LOG_DEBUG("Kompute Sequence record function started");

    this->begin();

    KP_LOG_DEBUG(
      "Kompute Sequence running record on OpBase derived class instance");

    op->record(*this->mCommandBuffer);

    this->mOperations.push_back(op);

    if (this->timestampQueryPool)
        this->mCommandBuffer->writeTimestamp(
          vk::PipelineStageFlagBits::eAllCommands,
          *this->timestampQueryPool,
          this->mOperations.size());

    return shared_from_this();
}
// Creates the command pool on this sequence's queue family and marks it as
// owned (mFreeCommandPool) so destroy() releases it.
void
Sequence::createCommandPool()
{
    KP_LOG_DEBUG("Kompute Sequence creating command pool");

    if (!this->mDevice) {
        throw std::runtime_error("Kompute Sequence device is null");
    }

    this->mFreeCommandPool = true;

    vk::CommandPoolCreateInfo commandPoolInfo(vk::CommandPoolCreateFlags(),
                                              this->mQueueIndex);
    // NOTE(review): the vk::Result from createCommandPool is discarded —
    // confirm failure is surfaced (e.g. via vulkan-hpp exceptions).
    this->mCommandPool = std::make_shared<vk::CommandPool>();
    this->mDevice->createCommandPool(
      &commandPoolInfo, nullptr, this->mCommandPool.get());
    KP_LOG_DEBUG("Kompute Sequence Command Pool Created");
}

// Allocates one primary command buffer from the pool and marks it owned
// (mFreeCommandBuffer) so destroy() frees it.
void
Sequence::createCommandBuffer()
{
    KP_LOG_DEBUG("Kompute Sequence creating command buffer");
    if (!this->mDevice) {
        throw std::runtime_error("Kompute Sequence device is null");
    }
    if (!this->mCommandPool) {
        throw std::runtime_error("Kompute Sequence command pool is null");
    }

    this->mFreeCommandBuffer = true;

    vk::CommandBufferAllocateInfo commandBufferAllocateInfo(
      *this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1);

    // NOTE(review): allocation result also discarded — same concern as above.
    this->mCommandBuffer = std::make_shared<vk::CommandBuffer>();
    this->mDevice->allocateCommandBuffers(&commandBufferAllocateInfo,
                                          this->mCommandBuffer.get());
    KP_LOG_DEBUG("Kompute Sequence Command Buffer Created");
}
// Creates a timestamp query pool with `totalTimestamps` slots; throws if the
// device does not support timestamps on compute/graphics queues.
void
Sequence::createTimestampQueryPool(uint32_t totalTimestamps)
{
    KP_LOG_DEBUG("Kompute Sequence creating query pool");
    if (!this->isInit()) {
        throw std::runtime_error(
          "createTimestampQueryPool() called on uninitialized Sequence");
    }
    if (!this->mPhysicalDevice) {
        throw std::runtime_error("Kompute Sequence physical device is null");
    }

    vk::PhysicalDeviceProperties physicalDeviceProperties =
      this->mPhysicalDevice->getProperties();

    if (physicalDeviceProperties.limits.timestampComputeAndGraphics) {
        vk::QueryPoolCreateInfo queryPoolInfo;
        queryPoolInfo.setQueryCount(totalTimestamps);
        queryPoolInfo.setQueryType(vk::QueryType::eTimestamp);
        this->timestampQueryPool = std::make_shared<vk::QueryPool>(
          this->mDevice->createQueryPool(queryPoolInfo));
        KP_LOG_DEBUG("Query pool for timestamps created");
    } else {
        throw std::runtime_error("Device does not support timestamps");
    }
}

// Reads back one timestamp per recorded operation plus the baseline written
// in begin() (mOperations.size() + 1 values), waiting for results.
std::vector<std::uint64_t>
Sequence::getTimestamps()
{
    if (!this->timestampQueryPool)
        throw std::runtime_error("Timestamp latching not enabled");

    const auto n = this->mOperations.size() + 1;
    std::vector<std::uint64_t> timestamps(n, 0);
    // NOTE(review): the vk::Result of getQueryPoolResults is discarded —
    // with eWait this should block until available, but confirm error
    // handling is acceptable.
    this->mDevice->getQueryPoolResults(
      *this->timestampQueryPool,
      0,
      n,
      timestamps.size() * sizeof(std::uint64_t),
      timestamps.data(),
      sizeof(uint64_t),
      vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);

    return timestamps;
}
}

451
kompute/src/Tensor.cpp Normal file
View File

@ -0,0 +1,451 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Tensor.hpp"
namespace kp {
// Human-readable name for a tensor element data type.
std::string
Tensor::toString(Tensor::TensorDataTypes dt)
{
    if (dt == TensorDataTypes::eBool)
        return "eBool";
    if (dt == TensorDataTypes::eInt)
        return "eInt";
    if (dt == TensorDataTypes::eUnsignedInt)
        return "eUnsignedInt";
    if (dt == TensorDataTypes::eFloat)
        return "eFloat";
    if (dt == TensorDataTypes::eDouble)
        return "eDouble";
    return "unknown";
}
// Human-readable name for a tensor residency type.
std::string
Tensor::toString(Tensor::TensorTypes dt)
{
    if (dt == TensorTypes::eDevice)
        return "eDevice";
    if (dt == TensorTypes::eHost)
        return "eHost";
    if (dt == TensorTypes::eStorage)
        return "eStorage";
    return "unknown";
}
// Constructs a tensor wrapping externally allocated buffers/memory; actual
// resource wiring happens in rebuild().
Tensor::Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
               std::shared_ptr<vk::Device> device,
               void* data,
               uint32_t elementTotalCount,
               uint32_t elementMemorySize,
               const TensorDataTypes& dataType,
               vk::DeviceMemory *primaryMemory,
               vk::Buffer *primaryBuffer,
               vk::DeviceMemory *stagingMemory,
               vk::Buffer *stagingBuffer,
               vk::DeviceSize offset,
               const TensorTypes& tensorType)
{
    KP_LOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
                 elementTotalCount,
                 Tensor::toString(tensorType));

    this->mPhysicalDevice = physicalDevice;
    this->mDevice = device;
    this->mDataType = dataType;
    this->mTensorType = tensorType;

    this->rebuild(data, elementTotalCount, elementMemorySize, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
}

Tensor::~Tensor()
{
    KP_LOG_DEBUG("Kompute Tensor destructor started. Type: {}",
                 Tensor::toString(this->tensorType()));

    if (this->mDevice) {
        this->destroy();
    }

    KP_LOG_DEBUG("Kompute Tensor destructor success");
}

// Re-attaches the tensor to a (possibly new) set of GPU resources, tearing
// down previous ones first. The `data` parameter is currently unused — the
// host pointer is managed elsewhere (NOTE(review): confirm via the manager).
void
Tensor::rebuild(void* /*data*/,
                uint32_t elementTotalCount,
                uint64_t memorySize,
                vk::DeviceMemory *primaryMemory,
                vk::Buffer *primaryBuffer,
                vk::DeviceMemory *stagingMemory,
                vk::Buffer *stagingBuffer,
                vk::DeviceSize offset)
{
    KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", elementTotalCount);

    this->mSize = elementTotalCount;
    this->mMemorySize = memorySize;
    this->mOffset = offset;

    if (this->mPrimaryBuffer || this->mPrimaryMemory) {
        KP_LOG_DEBUG(
          "Kompute Tensor destroying existing resources before rebuild");
        this->destroy();
    }

    this->setGPUResources(primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
}
// Residency type (eDevice / eHost / eStorage).
Tensor::TensorTypes
Tensor::tensorType()
{
    return this->mTensorType;
}

// True once the tensor has a device, primary buffer/memory and mapped data.
bool
Tensor::isInit()
{
    return this->mDevice && this->mPrimaryBuffer && this->mPrimaryMemory &&
           this->mRawData;
}

// Number of elements (not bytes).
uint32_t
Tensor::size()
{
    return this->mSize;
}

// Total size in bytes.
uint64_t
Tensor::memorySize()
{
    return this->mMemorySize;
}

kp::Tensor::TensorDataTypes
Tensor::dataType()
{
    return this->mDataType;
}

// Host-visible mapped pointer (null after destroy()).
void*
Tensor::rawData()
{
    return this->mRawData;
}

// Copies memorySize() bytes into the mapped pointer.
// NOTE(review): no null check on mRawData — destroy() nulls it, confirm
// callers never write after destruction.
void
Tensor::setRawData(const void* data)
{
    memcpy(this->mRawData, data, this->memorySize());
}
// Records a device-side copy from another tensor's primary buffer into this
// tensor's primary buffer, using this tensor's offset for both src and dst.
void
Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer,
                       std::shared_ptr<Tensor> copyFromTensor)
{

    vk::DeviceSize bufferSize(this->memorySize());
    vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize);

    KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize);

    this->recordCopyBuffer(commandBuffer,
                           copyFromTensor->mPrimaryBuffer,
                           this->mPrimaryBuffer,
                           bufferSize,
                           copyRegion);
}
// Host -> device upload: staging buffer into primary buffer. No-op for
// tensors that have no staging buffer (eHost / eStorage).
void
Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer)
{
    if (!this->mStagingBuffer)
        return;

    const vk::DeviceSize numBytes(this->memorySize());
    const vk::BufferCopy region(mOffset, mOffset, numBytes);

    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", numBytes);

    this->recordCopyBuffer(commandBuffer,
                           this->mStagingBuffer,
                           this->mPrimaryBuffer,
                           numBytes,
                           region);
}

// Device -> host download: primary buffer into staging buffer. No-op when
// there is no staging buffer.
void
Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer)
{
    if (!this->mStagingBuffer)
        return;

    const vk::DeviceSize numBytes(this->memorySize());
    const vk::BufferCopy region(mOffset, mOffset, numBytes);

    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", numBytes);

    this->recordCopyBuffer(commandBuffer,
                           this->mPrimaryBuffer,
                           this->mStagingBuffer,
                           numBytes,
                           region);
}
// Thin wrapper over vkCmdCopyBuffer; the size parameter is unused because
// the copy extent is already encoded in copyRegion.
void
Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
                         vk::Buffer *bufferFrom,
                         vk::Buffer *bufferTo,
                         vk::DeviceSize /*bufferSize*/,
                         vk::BufferCopy copyRegion)
{

    commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
}
// Records a memory barrier on the primary (device-local) buffer.
void
Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                         vk::AccessFlagBits srcAccessMask,
                                         vk::AccessFlagBits dstAccessMask,
                                         vk::PipelineStageFlagBits srcStageMask,
                                         vk::PipelineStageFlagBits dstStageMask)
{
    KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier");

    this->recordBufferMemoryBarrier(commandBuffer,
                                    *this->mPrimaryBuffer,
                                    srcAccessMask,
                                    dstAccessMask,
                                    srcStageMask,
                                    dstStageMask);
}

// Records a memory barrier on the staging buffer; silently no-ops when the
// tensor has no staging buffer.
void
Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                         vk::AccessFlagBits srcAccessMask,
                                         vk::AccessFlagBits dstAccessMask,
                                         vk::PipelineStageFlagBits srcStageMask,
                                         vk::PipelineStageFlagBits dstStageMask)
{
    if (!this->mStagingBuffer)
        return;

    KP_LOG_DEBUG("Kompute Tensor recording STAGING buffer memory barrier");

    this->recordBufferMemoryBarrier(commandBuffer,
                                    *this->mStagingBuffer,
                                    srcAccessMask,
                                    dstAccessMask,
                                    srcStageMask,
                                    dstStageMask);
}

// Shared implementation: barrier over the whole memorySize() byte range
// starting at offset 0 (NOTE(review): mOffset is not applied here — confirm
// a full-buffer barrier is intended for sub-allocated tensors).
void
Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                  const vk::Buffer& buffer,
                                  vk::AccessFlagBits srcAccessMask,
                                  vk::AccessFlagBits dstAccessMask,
                                  vk::PipelineStageFlagBits srcStageMask,
                                  vk::PipelineStageFlagBits dstStageMask)
{
    KP_LOG_DEBUG("Kompute Tensor recording buffer memory barrier");

    vk::DeviceSize bufferSize = this->memorySize();

    vk::BufferMemoryBarrier bufferMemoryBarrier;
    bufferMemoryBarrier.buffer = buffer;
    bufferMemoryBarrier.size = bufferSize;
    bufferMemoryBarrier.srcAccessMask = srcAccessMask;
    bufferMemoryBarrier.dstAccessMask = dstAccessMask;
    bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;

    commandBuffer.pipelineBarrier(srcStageMask,
                                  dstStageMask,
                                  vk::DependencyFlags(),
                                  nullptr,
                                  bufferMemoryBarrier,
                                  nullptr);
}
// Builds the descriptor info pointing at this tensor's slice of the primary
// buffer (offset mOffset, length memorySize()).
vk::DescriptorBufferInfo
Tensor::constructDescriptorBufferInfo()
{
    KP_LOG_DEBUG("Kompute Tensor construct descriptor buffer info size {}",
                 this->memorySize());
    vk::DeviceSize bufferSize = this->memorySize();
    return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
                                    mOffset, // offset
                                    bufferSize);
}
// Usage flags for the primary buffer. eDevice and eHost tensors need
// transfer in both directions on top of storage usage; eStorage tensors are
// never copied, so storage-only. (Removed unreachable `break`s after
// `return` and merged the two identical cases.)
vk::BufferUsageFlags
Tensor::getPrimaryBufferUsageFlags()
{
    switch (this->mTensorType) {
        case TensorTypes::eDevice:
        case TensorTypes::eHost:
            return vk::BufferUsageFlagBits::eStorageBuffer |
                   vk::BufferUsageFlagBits::eTransferSrc |
                   vk::BufferUsageFlagBits::eTransferDst;
        case TensorTypes::eStorage:
            return vk::BufferUsageFlagBits::eStorageBuffer;
        default:
            throw std::runtime_error("Kompute Tensor invalid tensor type");
    }
}
// Memory properties for the primary allocation: device-local for eDevice and
// eStorage, host-visible/coherent for eHost. (Removed unreachable `break`s
// after `return`.)
vk::MemoryPropertyFlags
Tensor::getPrimaryMemoryPropertyFlags()
{
    switch (this->mTensorType) {
        case TensorTypes::eDevice:
        case TensorTypes::eStorage:
            return vk::MemoryPropertyFlagBits::eDeviceLocal;
        case TensorTypes::eHost:
            return vk::MemoryPropertyFlagBits::eHostVisible |
                   vk::MemoryPropertyFlagBits::eHostCoherent;
        default:
            throw std::runtime_error("Kompute Tensor invalid tensor type");
    }
}
// Usage flags for the staging buffer; only eDevice tensors have one.
// (Removed the unreachable `break` after `return`.)
vk::BufferUsageFlags
Tensor::getStagingBufferUsageFlags()
{
    switch (this->mTensorType) {
        case TensorTypes::eDevice:
            return vk::BufferUsageFlagBits::eTransferSrc |
                   vk::BufferUsageFlagBits::eTransferDst;
        default:
            throw std::runtime_error("Kompute Tensor invalid tensor type");
    }
}
// Memory properties for the staging allocation; only eDevice tensors have
// one. (Removed the unreachable `break` after `return`.)
vk::MemoryPropertyFlags
Tensor::getStagingMemoryPropertyFlags()
{
    switch (this->mTensorType) {
        case TensorTypes::eDevice:
            return vk::MemoryPropertyFlagBits::eHostVisible |
                   vk::MemoryPropertyFlagBits::eHostCoherent;
        default:
            throw std::runtime_error("Kompute Tensor invalid tensor type");
    }
}
/**
 * Attaches externally allocated buffers/memory to this tensor. Only eDevice
 * tensors get a staging pair; the offset parameter is unused here (stored by
 * rebuild()).
 */
void
Tensor::setGPUResources(vk::DeviceMemory *primaryMemory,
                        vk::Buffer *primaryBuffer,
                        vk::DeviceMemory *stagingMemory,
                        vk::Buffer *stagingBuffer,
                        vk::DeviceSize /*offset*/)
{
    KP_LOG_DEBUG("Kompute Tensor creating buffer");

    if (!this->mPhysicalDevice) {
        // Fixed typo in the error message ("phyisical" -> "physical").
        throw std::runtime_error("Kompute Tensor physical device is null");
    }
    if (!this->mDevice) {
        throw std::runtime_error("Kompute Tensor device is null");
    }

    KP_LOG_DEBUG("Kompute Tensor creating primary buffer and memory");

    this->mPrimaryBuffer = primaryBuffer;
    this->mPrimaryMemory = primaryMemory;

    if (this->mTensorType == TensorTypes::eDevice) {
        KP_LOG_DEBUG("Kompute Tensor creating staging buffer and memory");

        this->mStagingBuffer = stagingBuffer;
        this->mStagingMemory = stagingMemory;
    }

    KP_LOG_DEBUG("Kompute Tensor buffer & memory creation successful");
}
// Invalidates the tensor: clears the mapped pointer and sizes regardless of
// device state, then releases the device handle. The buffers/memory are
// externally owned and are not destroyed here.
void
Tensor::destroy()
{
    KP_LOG_DEBUG("Kompute Tensor started destroy()");

    // Setting raw data to null regardless whether device is available to
    // invalidate Tensor
    this->mRawData = nullptr;
    this->mSize = 0;
    this->mMemorySize = 0;

    if (!this->mDevice) {
        KP_LOG_WARN(
          "Kompute Tensor destructor reached with null Device pointer");
        return;
    }

    // The redundant device re-check from the original was dropped: after the
    // guard above mDevice is always non-null here.
    this->mDevice = nullptr;

    KP_LOG_DEBUG("Kompute Tensor successful destroy()");
}
// Maps each supported TensorT element type to its runtime data-type tag.
template<>
Tensor::TensorDataTypes
TensorT<bool>::dataType()
{
    return Tensor::TensorDataTypes::eBool;
}

template<>
Tensor::TensorDataTypes
TensorT<int32_t>::dataType()
{
    return Tensor::TensorDataTypes::eInt;
}

template<>
Tensor::TensorDataTypes
TensorT<uint32_t>::dataType()
{
    return Tensor::TensorDataTypes::eUnsignedInt;
}

template<>
Tensor::TensorDataTypes
TensorT<float>::dataType()
{
    return Tensor::TensorDataTypes::eFloat;
}

template<>
Tensor::TensorDataTypes
TensorT<double>::dataType()
{
    return Tensor::TensorDataTypes::eDouble;
}

}
}

View File

@ -0,0 +1,46 @@
cmake_minimum_required(VERSION 3.20)

# ####################################################
# Kompute
# ####################################################
# NOTE(review): the `kompute` and `kp_logger` targets are defined elsewhere;
# this file only attaches include paths and IDE-visible sources to them.
# Split include paths between the build tree and an installed package.
target_include_directories(kompute PUBLIC $<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)

target_sources(kompute PRIVATE
    # Header files (useful in IDEs)
    kompute/Algorithm.hpp
    kompute/Core.hpp
    kompute/Kompute.hpp
    kompute/Manager.hpp
    kompute/Sequence.hpp
    kompute/Tensor.hpp

    kompute/operations/OpAlgoDispatch.hpp
    kompute/operations/OpBase.hpp
    kompute/operations/OpMemoryBarrier.hpp
    kompute/operations/OpMult.hpp
    kompute/operations/OpTensorCopy.hpp
    kompute/operations/OpTensorSyncDevice.hpp
    kompute/operations/OpTensorSyncLocal.hpp
    kompute/operations/OpBufferSyncDevice.hpp
    kompute/operations/OpBufferSyncLocal.hpp

    kompute/logger/Logger.hpp
)

install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# ####################################################
# Logger
# ####################################################
target_include_directories(kp_logger PUBLIC $<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)

target_sources(kp_logger PRIVATE
    # Header files (useful in IDEs)
    kompute/logger/Logger.hpp
)

# NOTE(review): sources above live under `kompute/logger/`, but this installs
# a sibling `logger/` directory — confirm that directory exists relative to
# this file, otherwise this install rule is a no-op or error.
install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

View File

@ -0,0 +1,338 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include "kompute/Core.hpp"
#include "fmt/format.h"
#include "kompute/Tensor.hpp"
#include "logger/Logger.hpp"
namespace kp {
/**
Abstraction for compute shaders that are run on top of tensors grouped via
ParameterGroups (which group descriptorsets)
*/
class Algorithm
{
public:
/**
* Main constructor for algorithm with configuration parameters to create
* the underlying resources.
*
* @param device The Vulkan device to use for creating resources
* @param tensors (optional) The tensors to use to create the descriptor
* resources
* @param spirv (optional) The spirv code to use to create the algorithm
* @param workgroup (optional) The kp::Workgroup to use for the dispatch
* which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
* @param specializationConstants (optional) The templatable param is to be
* used to initialize the specialization constants which cannot be changed
* once set.
* @param pushConstants (optional) This templatable param is to be used
* when initializing the pipeline, which set the size of the push constants
* - these can be modified but all new values must have the same data type
* and length as otherwise it will result in errors.
*/
    template<typename S = float, typename P = float>
    Algorithm(std::shared_ptr<vk::Device> device,
              vk::DescriptorPool *pool,
              const std::vector<std::shared_ptr<Tensor>>& tensors = {},
              const std::vector<uint32_t>& spirv = {},
              const Workgroup& workgroup = {},
              const std::vector<S>& specializationConstants = {},
              const std::vector<P>& pushConstants = {})
    {
        KP_LOG_DEBUG("Kompute Algorithm Constructor with device");

        this->mDevice = device;
        this->mDescriptorPool = pool;

        // Only build the Vulkan pipeline when both tensors and shader code
        // are provided; otherwise defer everything to a later rebuild().
        if (tensors.size() && spirv.size()) {
            KP_LOG_INFO(
              "Kompute Algorithm initialising with tensor size: {} and "
              "spirv size: {}",
              tensors.size(),
              spirv.size());
            this->rebuild(tensors,
                          spirv,
                          workgroup,
                          specializationConstants,
                          pushConstants);
        } else {
            KP_LOG_INFO(
              "Kompute Algorithm constructor with empty tensors and or "
              "spirv so not rebuilding vulkan components");
        }
    }
/**
* Rebuild function to reconstruct algorithm with configuration parameters
* to create the underlying resources.
*
* @param tensors The tensors to use to create the descriptor resources
* @param spirv The spirv code to use to create the algorithm
* @param workgroup (optional) The kp::Workgroup to use for the dispatch
* which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
* @param specializationConstants (optional) The std::vector<float> to use
* to initialize the specialization constants which cannot be changed once
* set.
* @param pushConstants (optional) The std::vector<float> to use when
* initializing the pipeline, which set the size of the push constants -
* these can be modified but all new values must have the same vector size
* as this initial value.
*/
    template<typename S = float, typename P = float>
    void rebuild(const std::vector<std::shared_ptr<Tensor>>& tensors,
                 const std::vector<uint32_t>& spirv,
                 const Workgroup& workgroup = {},
                 const std::vector<S>& specializationConstants = {},
                 const std::vector<P>& pushConstants = {})
    {
        KP_LOG_DEBUG("Kompute Algorithm rebuild started");

        this->mTensors = tensors;
        this->mSpirv = spirv;

        // Snapshot the specialization constants as raw bytes (element size +
        // count are kept so they can be re-described to Vulkan later).
        // NOTE(review): memcpy of the vector data assumes S is trivially
        // copyable — confirm that constraint is documented/enforced.
        if (specializationConstants.size()) {
            if (this->mSpecializationConstantsData) {
                free(this->mSpecializationConstantsData);
            }
            uint32_t memorySize =
              sizeof(decltype(specializationConstants.back()));
            uint32_t size = specializationConstants.size();
            uint32_t totalSize = size * memorySize;
            this->mSpecializationConstantsData = malloc(totalSize);
            memcpy(this->mSpecializationConstantsData,
                   specializationConstants.data(),
                   totalSize);
            this->mSpecializationConstantsDataTypeMemorySize = memorySize;
            this->mSpecializationConstantsSize = size;
        }

        // Same raw-byte snapshot for push constants (same trivially-copyable
        // assumption applies to P).
        if (pushConstants.size()) {
            if (this->mPushConstantsData) {
                free(this->mPushConstantsData);
            }
            uint32_t memorySize = sizeof(decltype(pushConstants.back()));
            uint32_t size = pushConstants.size();
            uint32_t totalSize = size * memorySize;
            this->mPushConstantsData = malloc(totalSize);
            memcpy(this->mPushConstantsData, pushConstants.data(), totalSize);
            this->mPushConstantsDataTypeMemorySize = memorySize;
            this->mPushConstantsSize = size;
        }

        // Default workgroup is (tensor[0].size(), 1, 1) when none is given.
        this->setWorkgroup(
          workgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);

        // Descriptor pool is created first so if available then destroy all
        // before rebuild
        if (this->isInit()) {
            this->destroy();
        }

        this->createParameters();
        this->createShaderModule();
        this->createPipeline();
    }
/**
 * Destructor for Algorithm which is responsible for freeing and destroying
 * respective pipelines and owned parameter groups.
 */
~Algorithm();

/**
 * Records the dispatch function with the provided template parameters or
 * alternatively using the size of the tensor by default.
 *
 * @param commandBuffer Command buffer to record the algorithm resources to
 */
void recordDispatch(const vk::CommandBuffer& commandBuffer);

/**
 * Records command that binds the "core" algorithm components which consist
 * of binding the pipeline and binding the descriptorsets.
 *
 * @param commandBuffer Command buffer to record the algorithm resources to
 */
void recordBindCore(const vk::CommandBuffer& commandBuffer);

/**
 * Records command that binds the push constants to the command buffer
 * provided
 * - it is required that the pushConstants provided are of the same size as
 * the ones provided during initialization.
 *
 * @param commandBuffer Command buffer to record the algorithm resources to
 */
void recordBindPush(const vk::CommandBuffer& commandBuffer);

/**
 * function that checks all the gpu resource components to verify if these
 * have been created and returns true if all are valid.
 *
 * @returns returns true if the algorithm is currently initialized.
 */
bool isInit();

/**
 * Sets the work group to use in the recordDispatch
 *
 * @param workgroup The kp::Workgroup value to use to update the algorithm.
 * It must have a value greater than 1 on the x value (index 0) otherwise it
 * will be initialized on the size of the first tensor (ie.
 * this->mTensor[0]->size())
 */
void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1);
/**
* Sets the push constants to the new value provided to use in the next
* bindPush()
*
* @param pushConstants The templatable vector is to be used to set the push
* constants to use in the next bindPush(...) calls. The constants provided
* must be of the same size as the ones created during initialization.
*/
template<typename T>
void setPushConstants(const std::vector<T>& pushConstants)
{
    // Forward to the raw-memory overload, deriving the element count and
    // the per-element byte size from the vector's value type.
    const uint32_t elementCount = pushConstants.size();
    const uint32_t elementMemorySize =
      sizeof(decltype(pushConstants.back()));
    this->setPushConstants(
      pushConstants.data(), elementCount, elementMemorySize);
}
// Swaps in a new descriptor pool, then refreshes the workgroup (against
// the currently held tensors) and the descriptor-set parameters.
void updateDescriptors(vk::DescriptorPool *pool)
{
    this->mDescriptorPool = pool;
    const uint32_t minWorkgroupSize =
      this->mTensors.empty() ? 1 : this->mTensors[0]->size();
    this->setWorkgroup(this->mWorkgroup, minWorkgroupSize);

    this->updateParameters(); // TODO: See if we can reduce this
}
/**
* Sets the push constants to the new value provided to use in the next
* bindPush() with the raw memory block location and memory size to be used.
*
* @param data The raw data point to copy the data from, without modifying
* the pointer.
* @param size The number of data elements provided in the data
* @param memorySize The memory size of each of the data elements in bytes.
*/
void setPushConstants(const void* data, uint32_t size, uint32_t memorySize)
{
    uint32_t totalSize = memorySize * size;
    uint32_t previousTotalSize =
      this->mPushConstantsDataTypeMemorySize * this->mPushConstantsSize;

    // The pipeline layout fixes the push constant byte size at
    // initialization, so any update must match the configured size.
    if (totalSize != previousTotalSize) {
        throw std::runtime_error(fmt::format(
          "Kompute Algorithm push "
          "constant total memory size provided is {} but expected {} bytes",
          totalSize,
          previousTotalSize));
    }

    // After the check above the new payload is guaranteed to have the same
    // byte size as the existing one, so reuse the current allocation when
    // present instead of a needless free()/malloc() round trip.
    if (!this->mPushConstantsData) {
        this->mPushConstantsData = malloc(totalSize);
    }
    memcpy(this->mPushConstantsData, data, totalSize);
    this->mPushConstantsDataTypeMemorySize = memorySize;
    this->mPushConstantsSize = size;
}
/**
 * Gets the current workgroup from the algorithm.
 *
 * @returns The kp::Workgroup currently set on the algorithm, as configured
 * via the constructor, rebuild() or setWorkgroup().
 */
const Workgroup& getWorkgroup();
/**
* Gets the specialization constants of the current algorithm.
*
* @returns The std::vector<float> currently set for specialization
* constants
*/
template<typename T>
const std::vector<T> getSpecializationConstants()
{
    // Reinterpret the owned type-erased buffer as T and copy out the
    // stored number of elements.
    const T* first = (T*)this->mSpecializationConstantsData;
    const T* last = first + this->mSpecializationConstantsSize;
    return std::vector<T>(first, last);
}
/**
 * Gets the push constants of the current algorithm.
 *
 * @returns The std::vector<float> currently set for push constants
 */
template<typename T>
const std::vector<T> getPushConstants()
{
    return { (T*)this->mPushConstantsData,
             ((T*)this->mPushConstantsData) + this->mPushConstantsSize };
}
/**
 * Gets the current tensors that are used in the algorithm.
 *
 * @returns The list of tensors used in the algorithm.
 */
const std::vector<std::shared_ptr<Tensor>>& getTensors();

// Replaces the tensors referenced by this algorithm.
void setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors);

// Destroys the Vulkan resources created during rebuild().
void destroy();

  private:
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::Device> mDevice;
    std::vector<std::shared_ptr<Tensor>> mTensors;

    // -------------- OPTIONALLY OWNED RESOURCES
    // Each resource below carries a mFree* flag indicating whether this
    // Algorithm owns it and must release it on destroy().
    std::shared_ptr<vk::DescriptorSetLayout> mDescriptorSetLayout;
    bool mFreeDescriptorSetLayout = false;
    vk::DescriptorPool *mDescriptorPool = nullptr;
    std::shared_ptr<vk::DescriptorSet> mDescriptorSet;
    bool mFreeDescriptorSet = false;
    std::shared_ptr<vk::ShaderModule> mShaderModule;
    bool mFreeShaderModule = false;
    std::shared_ptr<vk::PipelineLayout> mPipelineLayout;
    bool mFreePipelineLayout = false;
    std::shared_ptr<vk::PipelineCache> mPipelineCache;
    bool mFreePipelineCache = false;
    std::shared_ptr<vk::Pipeline> mPipeline;
    bool mFreePipeline = false;

    // -------------- ALWAYS OWNED RESOURCES
    std::vector<uint32_t> mSpirv;
    // Type-erased malloc'd copies of the constants; element size and count
    // are tracked separately so the data can be rebuilt with any type.
    void* mSpecializationConstantsData = nullptr;
    uint32_t mSpecializationConstantsDataTypeMemorySize = 0;
    uint32_t mSpecializationConstantsSize = 0;
    void* mPushConstantsData = nullptr;
    uint32_t mPushConstantsDataTypeMemorySize = 0;
    uint32_t mPushConstantsSize = 0;
    Workgroup mWorkgroup;

    // Create util functions
    void createShaderModule();
    void createPipeline();

    // Parameters
    void freeParameters();
    void createParameters();
    void updateParameters();
};
} // End namespace kp

View File

@ -0,0 +1,39 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once

#include <vulkan/vulkan.hpp>

// Typedefs to simplify interaction with core types
namespace kp {
// Three-dimensional compute dispatch size (x, y, z).
typedef std::array<uint32_t, 3> Workgroup;
// Default container type for specialization/push constant values.
typedef std::vector<float> Constants;
}

// Must be after vulkan is included
// Default the targeted Vulkan API version to 1.2 unless overridden by the
// build via KOMPUTE_VK_API_VERSION or the MAJOR/MINOR defines.
#ifndef KOMPUTE_VK_API_VERSION
#ifndef KOMPUTE_VK_API_MAJOR_VERSION
#define KOMPUTE_VK_API_MAJOR_VERSION 1
#endif // KOMPUTE_VK_API_MAJOR_VERSION
#ifndef KOMPUTE_VK_API_MINOR_VERSION
#define KOMPUTE_VK_API_MINOR_VERSION 2
#endif // KOMPUTE_VK_API_MINOR_VERSION
#define KOMPUTE_VK_API_VERSION \
VK_MAKE_VERSION( \
KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
#endif // KOMPUTE_VK_API_VERSION

#if defined(KOMPUTE_BUILD_PYTHON)
#include <pybind11/pybind11.h>
namespace py = pybind11;
// from python/src/main.cpp
extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
#endif

View File

@ -0,0 +1,21 @@
#pragma once
#include "Algorithm.hpp"
#include "Core.hpp"
#include "Manager.hpp"
#include "Sequence.hpp"
#include "Tensor.hpp"
#include "operations/OpAlgoDispatch.hpp"
#include "operations/OpBase.hpp"
#include "operations/OpMemoryBarrier.hpp"
#include "operations/OpMult.hpp"
#include "operations/OpTensorCopy.hpp"
#include "operations/OpTensorSyncDevice.hpp"
#include "operations/OpTensorSyncLocal.hpp"
#include "operations/OpBufferSyncDevice.hpp"
#include "operations/OpBufferSyncLocal.hpp"
// Will be build by CMake and placed inside the build directory
#include "ShaderLogisticRegression.hpp"
#include "ShaderOpMult.hpp"

View File

@ -0,0 +1,267 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include <set>
#include <unordered_map>
#include "kompute/Core.hpp"
#include "kompute/Sequence.hpp"
#include "logger/Logger.hpp"
#define KP_DEFAULT_SESSION "DEFAULT"
namespace kp {
/**
Base orchestrator which creates and manages device and child components
*/
class Manager
{
  public:
    /**
    Base constructor.
    */
    Manager();

    /**
     * Manager destructor which would ensure all owned resources are destroyed
     * unless explicitly stated that resources should not be destroyed or freed.
     */
    ~Manager();

    // Returns true once initializeDevice() has created a logical device.
    bool hasDevice() const {
        return this->mDevice.get();
    }

    /**
     * Initialize a device.
     *
     * @param physicalDeviceIndex The index of the physical device to use
     * @param familyQueueIndices (Optional) List of queue indices to add for
     * explicit allocation
     * @param desiredExtensions The desired extensions to load from
     * physicalDevice
     */
    void initializeDevice(uint32_t physicalDeviceIndex,
                          const std::vector<uint32_t>& familyQueueIndices = {},
                          const std::vector<std::string>& desiredExtensions = {});

    /**
     * Create a managed sequence that will be destroyed by this manager
     * if it hasn't been destroyed by its reference count going to zero.
     *
     * @param queueIndex The queue to use from the available queues
     * @param totalTimestamps The maximum number of timestamps to allocate.
     * If zero (default), disables latching of timestamps.
     * @returns Shared pointer with initialised sequence
     */
    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0,
                                       uint32_t totalTimestamps = 0);

    /**
     * Create a managed tensor that will be destroyed by this manager
     * if it hasn't been destroyed by its reference count going to zero.
     *
     * @param data The data to initialize the tensor with
     * @param primaryMemory Device memory backing the primary buffer
     * @param primaryBuffer Buffer the tensor's data lives in
     * @param stagingMemory Device memory backing the staging buffer
     * @param stagingBuffer Buffer used for host<->device transfers
     * @param tensorType The type of tensor to initialize
     * @returns Shared pointer with initialised tensor
     */
    template<typename T>
    std::shared_ptr<TensorT<T>> tensorT(
      const std::vector<T>& data,
      vk::DeviceMemory *primaryMemory,
      vk::Buffer *primaryBuffer,
      vk::DeviceMemory *stagingMemory,
      vk::Buffer *stagingBuffer,
      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
    {
        KP_LOG_DEBUG("Kompute Manager tensor creation triggered");

        std::shared_ptr<TensorT<T>> tensor{ new kp::TensorT<T>(
          this->mPhysicalDevice, this->mDevice, data, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, tensorType) };

        // Track as a weak reference so clear() can garbage-collect it.
        if (this->mManageResources) {
            this->mManagedTensors.push_back(tensor);
        }

        return tensor;
    }

    // Untyped variant of tensorT(): wraps a raw memory block of
    // elementTotalCount elements (memorySize bytes total) at the given
    // buffer offset.
    std::shared_ptr<Tensor> tensor(
      void* data,
      uint32_t elementTotalCount,
      uint64_t memorySize,
      const Tensor::TensorDataTypes& dataType,
      vk::DeviceMemory *primaryMemory,
      vk::Buffer *primaryBuffer,
      vk::DeviceMemory *stagingMemory,
      vk::Buffer *stagingBuffer,
      vk::DeviceSize offset,
      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
    {
        std::shared_ptr<Tensor> tensor{ new kp::Tensor(this->mPhysicalDevice,
                                                       this->mDevice,
                                                       data,
                                                       elementTotalCount,
                                                       memorySize,
                                                       dataType,
                                                       primaryMemory,
                                                       primaryBuffer,
                                                       stagingMemory,
                                                       stagingBuffer,
                                                       offset,
                                                       tensorType) };

        if (this->mManageResources) {
            this->mManagedTensors.push_back(tensor);
        }

        return tensor;
    }

    /**
     * Default non-template function that can be used to create algorithm
     * objects which provides default types to the push and spec constants as
     * floats.
     *
     * @param tensors (optional) The tensors to initialise the algorithm with
     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
     * defaults to (tensor[0].size(), 1, 1)
     * @param specializationConstants (optional) float vector to use for
     * specialization constants, and defaults to an empty constant
     * @param pushConstants (optional) float vector to use for push constants,
     * and defaults to an empty constant
     * @returns Shared pointer with initialised algorithm
     */
    std::shared_ptr<Algorithm> algorithm(
      vk::DescriptorPool *pool,
      const std::vector<std::shared_ptr<Tensor>>& tensors = {},
      const std::vector<uint32_t>& spirv = {},
      const Workgroup& workgroup = {},
      const std::vector<float>& specializationConstants = {},
      const std::vector<float>& pushConstants = {})
    {
        return this->algorithm<>(
          pool, tensors, spirv, workgroup, specializationConstants, pushConstants);
    }

    /**
     * Create a managed algorithm that will be destroyed by this manager
     * if it hasn't been destroyed by its reference count going to zero.
     *
     * @param tensors (optional) The tensors to initialise the algorithm with
     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
     * defaults to (tensor[0].size(), 1, 1)
     * @param specializationConstants (optional) templatable vector parameter to
     * use for specialization constants, and defaults to an empty constant
     * @param pushConstants (optional) templatable vector parameter to use for
     * push constants, and defaults to an empty constant
     * @returns Shared pointer with initialised algorithm
     */
    template<typename S = float, typename P = float>
    std::shared_ptr<Algorithm> algorithm(
      vk::DescriptorPool *pool,
      const std::vector<std::shared_ptr<Tensor>>& tensors,
      const std::vector<uint32_t>& spirv,
      const Workgroup& workgroup,
      const std::vector<S>& specializationConstants,
      const std::vector<P>& pushConstants)
    {
        KP_LOG_DEBUG("Kompute Manager algorithm creation triggered");

        std::shared_ptr<Algorithm> algorithm{ new kp::Algorithm(
          this->mDevice,
          pool,
          tensors,
          spirv,
          workgroup,
          specializationConstants,
          pushConstants) };

        if (this->mManageResources) {
            this->mManagedAlgorithms.push_back(algorithm);
        }

        return algorithm;
    }

    /**
     * Destroy the GPU resources and all managed resources by manager.
     **/
    void destroy();

    /**
     * Run a pseudo-garbage collection to release all the managed resources
     * that have been already freed due to these reaching to zero ref count.
     **/
    void clear();

    /**
     * Information about the current device.
     *
     * @return vk::PhysicalDeviceProperties containing information about the
     *device
     **/
    vk::PhysicalDeviceProperties getDeviceProperties() const;

    /**
     * List the devices available in the current vulkan instance.
     *
     * @return vector of physical devices containing their respective properties
     **/
    std::vector<vk::PhysicalDevice> listDevices() const;

    /**
     * The current Vulkan instance.
     *
     * @return a shared pointer to the current Vulkan instance held by this
     *object
     **/
    std::shared_ptr<vk::Instance> getVkInstance() const;

    // Accessors for the logical and physical device handles.
    std::shared_ptr<vk::Device> device() const { return mDevice; }
    std::shared_ptr<vk::PhysicalDevice> physicalDevice() const { return mPhysicalDevice; }

  private:
    // -------------- OPTIONALLY OWNED RESOURCES
    // mFree* flags record whether this Manager created (and must destroy)
    // the corresponding handle.
    std::shared_ptr<vk::Instance> mInstance = nullptr;
    bool mFreeInstance = false;
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
    std::shared_ptr<vk::Device> mDevice = nullptr;
    bool mFreeDevice = false;

    // -------------- ALWAYS OWNED RESOURCES
    // Weak references: resources freed by their own ref count are pruned
    // by clear().
    std::vector<std::weak_ptr<Tensor>> mManagedTensors;
    std::vector<std::weak_ptr<Sequence>> mManagedSequences;
    std::vector<std::weak_ptr<Algorithm>> mManagedAlgorithms;

    std::vector<uint32_t> mComputeQueueFamilyIndices;
    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;

    bool mManageResources = false;

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    vk::DebugReportCallbackEXT mDebugReportCallback;
    vk::DispatchLoaderDynamic mDebugDispatcher;
#endif

    // Create functions
    void createInstance();
    void createDevice(const std::vector<uint32_t>& familyQueueIndices = {},
                      uint32_t physicalDeviceIndex = 0,
                      const std::vector<std::string>& desiredExtensions = {});
};
} // End namespace kp

View File

@ -0,0 +1,313 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include "kompute/Core.hpp"
#include "kompute/operations/OpAlgoDispatch.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
* Container of operations that can be sent to GPU as batch
*/
class Sequence : public std::enable_shared_from_this<Sequence>
{
  public:
    /**
     * Main constructor for sequence which requires core vulkan components to
     * generate all dependent resources.
     *
     * @param physicalDevice Vulkan physical device
     * @param device Vulkan logical device
     * @param computeQueue Vulkan compute queue
     * @param queueIndex Vulkan compute queue index in device
     * @param totalTimestamps Maximum number of timestamps to allocate
     */
    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
             std::shared_ptr<vk::Device> device,
             std::shared_ptr<vk::Queue> computeQueue,
             uint32_t queueIndex,
             uint32_t totalTimestamps = 0);

    /**
     * Destructor for sequence which is responsible for cleaning all subsequent
     * owned operations.
     */
    ~Sequence();

    /**
     * Record function for operation to be added to the GPU queue in batch. This
     * template requires classes to be derived from the OpBase class. This
     * function also requires the Sequence to be recording, otherwise it will
     * not be able to add the operation.
     *
     * @param op Object derived from kp::BaseOp that will be recorded by the
     * sequence which will be used when the operation is evaluated.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);

    /**
     * Record function for operation to be added to the GPU queue in batch. This
     * template requires classes to be derived from the OpBase class. This
     * function also requires the Sequence to be recording, otherwise it will
     * not be able to add the operation.
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> record(
      std::vector<std::shared_ptr<Tensor>> tensors,
      TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
        return this->record(op);
    }

    /**
     * Record function for operation to be added to the GPU queue in batch. This
     * template requires classes to be derived from the OpBase class. This
     * function also requires the Sequence to be recording, otherwise it will
     * not be able to add the operation.
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm,
                                     TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->record(op);
    }

    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job synchronously (with a barrier).
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> eval();

    /**
     * Resets all the recorded and stored operations, records the operation
     * provided and submits into the gpu as a submit job synchronously (with a
     * barrier).
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);

    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job with a barrier.
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    /**
     * Eval overload for raw-buffer operations: constructs the operation T
     * from primary/staging buffers and a byte size, then evaluates it
     * synchronously.
     *
     * @param primaryBuffer Device-local buffer for the operation
     * @param stagingBuffer Host-visible staging buffer for the operation
     * @param size Size in bytes to operate on
     * @param TArgs Additional arguments forwarded to T's constructor
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(vk::Buffer *primaryBuffer,
                                   vk::Buffer *stagingBuffer,
                                   vk::DeviceSize size,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(primaryBuffer, stagingBuffer, size, std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job with a barrier.
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    /**
     * Eval Async sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job without a barrier. EvalAwait()
     * must ALWAYS be called after to ensure the sequence is terminated
     * correctly.
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAsync();

    /**
     * Clears current operations to record provided one in the vector of
     * operations into the gpu as a submit job without a barrier. EvalAwait()
     * must ALWAYS be called after to ensure the sequence is terminated
     * correctly.
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);

    /**
     * Eval Async sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job without a barrier. EvalAwait()
     * must be called after to terminate the sequence correctly.
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> evalAsync(
      std::vector<std::shared_ptr<Tensor>> tensors,
      TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
        return this->evalAsync(op);
    }

    /**
     * Eval Async sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job without a barrier. EvalAwait()
     * must be called after to terminate the sequence correctly.
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm,
                                        TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->evalAsync(op);
    }

    /**
     * Eval Await waits for the fence to finish processing and then once it
     * finishes, it runs the postEval of all operations.
     *
     * @param waitFor Number of milliseconds to wait before timing out.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);

    /**
     * Clear function clears all operations currently recorded and starts
     * recording again.
     */
    void clear();

    /**
     * Return the timestamps that were latched at the beginning and
     * after each operation during the last eval() call.
     */
    std::vector<std::uint64_t> getTimestamps();

    /**
     * Begins recording commands for commands to be submitted into the command
     * buffer.
     */
    void begin();

    /**
     * Ends the recording and stops recording commands when the record command
     * is sent.
     */
    void end();

    /**
     * Returns true if the sequence is currently in recording activated.
     *
     * @return Boolean stating if recording ongoing.
     */
    bool isRecording() const;

    /**
     * Returns true if the sequence has been initialised, and it's based on the
     * GPU resources being referenced.
     *
     * @return Boolean stating if is initialized
     */
    bool isInit() const;

    /**
     * Clears command buffer and triggers re-record of all the current
     * operations saved, which is useful if the underlying kp::Tensors or
     * kp::Algorithms are modified and need to be re-recorded.
     */
    void rerecord();

    /**
     * Returns true if the sequence is currently running - mostly used for async
     * workloads.
     *
     * @return Boolean stating if currently running.
     */
    bool isRunning() const;

    /**
     * Destroys and frees the GPU resources which include the buffer and memory
     * and sets the sequence as init=False.
     */
    void destroy();

  private:
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
    std::shared_ptr<vk::Device> mDevice = nullptr;
    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
    uint32_t mQueueIndex = -1;

    // -------------- OPTIONALLY OWNED RESOURCES
    std::shared_ptr<vk::CommandPool> mCommandPool = nullptr;
    bool mFreeCommandPool = false;
    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
    bool mFreeCommandBuffer = false;

    // -------------- ALWAYS OWNED RESOURCES
    vk::Fence mFence;
    std::vector<std::shared_ptr<OpBase>> mOperations{};
    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;

    // State
    bool mRecording = false;
    bool mIsRunning = false;

    // Create functions
    void createCommandPool();
    void createCommandBuffer();
    void createTimestampQueryPool(uint32_t totalTimestamps);
};
} // End namespace kp

View File

@ -0,0 +1,306 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include "kompute/Core.hpp"
#include "logger/Logger.hpp"
#include <memory>
#include <string>
namespace kp {
/**
* Structured data used in GPU operations.
*
* Tensors are the base building block in Kompute to perform operations across
* GPUs. Each tensor would have a respective Vulkan memory and buffer, which
* would be used to store their respective data. The tensors can be used for GPU
* data storage or transfer.
*/
class Tensor
{
public:
/**
* Type for tensors created: Device allows memory to be transferred from
* staging buffers. Staging are host memory visible. Storage are device
* visible but are not set up to transfer or receive data (only for shader
* storage).
*/
enum class TensorTypes
{
eDevice = 0, ///< Type is device memory, source and destination
eHost = 1, ///< Type is host memory, source and destination
eStorage = 2, ///< Type is Device memory (only)
};
enum class TensorDataTypes
{
eBool = 0,
eInt = 1,
eUnsignedInt = 2,
eFloat = 3,
eDouble = 4,
};
static std::string toString(TensorDataTypes dt);
static std::string toString(TensorTypes dt);
/**
* Constructor with data provided which would be used to create the
* respective vulkan buffer and memory.
*
* @param physicalDevice The physical device to use to fetch properties
* @param device The device to use to create the buffer and memory from
* @param data Non-zero-sized vector of data that will be used by the
* tensor
* @param tensorTypes Type for the tensor which is of type TensorTypes
*/
Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
void* data,
uint32_t elementTotalCount,
uint32_t memorySize,
const TensorDataTypes& dataType,
vk::DeviceMemory *primaryMemory,
vk::Buffer *primaryBuffer,
vk::DeviceMemory *stagingMemory,
vk::Buffer *stagingBuffer,
vk::DeviceSize offset,
const TensorTypes& tensorType = TensorTypes::eDevice);
/**
* Destructor which is in charge of freeing vulkan resources unless they
* have been provided externally.
*/
virtual ~Tensor();
/**
* Function to trigger reinitialisation of the tensor buffer and memory with
* new data as well as new potential device type.
*
* @param data Vector of data to use to initialise vector from
* @param tensorType The type to use for the tensor
*/
void rebuild(void* data,
uint32_t elementTotalCount,
uint64_t memorySize,
vk::DeviceMemory *primaryMemory,
vk::Buffer *primaryBuffer,
vk::DeviceMemory *stagingMemory,
vk::Buffer *stagingBuffer,
vk::DeviceSize offset);
/**
* Destroys and frees the GPU resources which include the buffer and memory.
*/
void destroy();
/**
* Check whether tensor is initialized based on the created gpu resources.
*
* @returns Boolean stating whether tensor is initialized
*/
bool isInit();
/**
* Retrieve the tensor type of the Tensor
*
* @return Tensor type of tensor
*/
TensorTypes tensorType();
/**
* Records a copy from the memory of the tensor provided to the current
* thensor. This is intended to pass memory into a processing, to perform
* a staging buffer transfer, or to gather output (between others).
*
* @param commandBuffer Vulkan Command Buffer to record the commands into
* @param copyFromTensor Tensor to copy the data from
*/
void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
std::shared_ptr<Tensor> copyFromTensor);
/**
* Records a copy from the internal staging memory to the device memory
* using an optional barrier to wait for the operation. This function would
* only be relevant for kp::Tensors of type eDevice.
*
* @param commandBuffer Vulkan Command Buffer to record the commands into
*/
void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer);
/**
* Records a copy from the internal device memory to the staging memory
* using an optional barrier to wait for the operation. This function would
* only be relevant for kp::Tensors of type eDevice.
*
* @param commandBuffer Vulkan Command Buffer to record the commands into
*/
void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer);
/**
* Records the buffer memory barrier into the primary buffer and command
* buffer which ensures that relevant data transfers are carried out
* correctly.
*
* @param commandBuffer Vulkan Command Buffer to record the commands into
* @param srcAccessMask Access flags for source access mask
* @param dstAccessMask Access flags for destination access mask
* @param scrStageMask Pipeline stage flags for source stage mask
* @param dstStageMask Pipeline stage flags for destination stage mask
*/
void recordPrimaryBufferMemoryBarrier(
const vk::CommandBuffer& commandBuffer,
vk::AccessFlagBits srcAccessMask,
vk::AccessFlagBits dstAccessMask,
vk::PipelineStageFlagBits srcStageMask,
vk::PipelineStageFlagBits dstStageMask);
/**
* Records the buffer memory barrier into the staging buffer and command
* buffer which ensures that relevant data transfers are carried out
* correctly.
*
* @param commandBuffer Vulkan Command Buffer to record the commands into
* @param srcAccessMask Access flags for source access mask
* @param dstAccessMask Access flags for destination access mask
* @param srcStageMask Pipeline stage flags for source stage mask
* @param dstStageMask Pipeline stage flags for destination stage mask
*/
void recordStagingBufferMemoryBarrier(
const vk::CommandBuffer& commandBuffer,
vk::AccessFlagBits srcAccessMask,
vk::AccessFlagBits dstAccessMask,
vk::PipelineStageFlagBits srcStageMask,
vk::PipelineStageFlagBits dstStageMask);
/**
* Constructs a vulkan descriptor buffer info which can be used to specify
* and reference the underlying buffer component of the tensor without
* exposing it.
*
* @return Descriptor buffer info with own buffer
*/
vk::DescriptorBufferInfo constructDescriptorBufferInfo();
/**
* Returns the size/magnitude of the Tensor, which will be the total number
* of elements across all dimensions
*
* @return Unsigned integer representing the total number of elements
*/
uint32_t size();
/**
* Returns the total memory size of the data contained by the Tensor object
*
* @return Unsigned integer representing the memory of the tensor in bytes.
*/
uint64_t memorySize();
/**
* Retrieve the data type of the tensor (host, device, storage)
*
* @return Data type of tensor of type kp::Tensor::TensorDataTypes
*/
TensorDataTypes dataType();
/**
* Retrieve the raw data via the pointer to the memory that contains the raw
* memory of this current tensor. This tensor gets changed to a nullptr when
* the Tensor is removed.
*
* @return Pointer to raw memory containing raw bytes data of Tensor.
*/
void* rawData();
/**
* Sets / resets the data of the tensor which is directly done on the GPU
* host visible memory available by the tensor.
*/
void setRawData(const void* data);
/**
* Template to return the pointer data converted by specific type, which
* would be any of the supported types including float, double, int32,
* uint32 and bool.
*
* @return Pointer to raw memory containing raw bytes data of Tensor.
*/
template<typename T>
T* data()
{
return (T*)this->mRawData;
}
/**
* Template to get the data of the current tensor as a vector of specific
* type, which would be any of the supported types including float, double,
* int32, uint32 and bool.
*
* @return Vector of type provided by template.
*/
template<typename T>
std::vector<T> vector()
{
return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
}
protected:
// -------------- ALWAYS OWNED RESOURCES
TensorTypes mTensorType;
TensorDataTypes mDataType;
uint32_t mSize = 0;
uint64_t mMemorySize = 0;
vk::DeviceSize mOffset = 0;
void* mRawData = nullptr;
private:
// -------------- NEVER OWNED RESOURCES
std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
std::shared_ptr<vk::Device> mDevice;
vk::Buffer *mPrimaryBuffer = nullptr;
vk::Buffer *mStagingBuffer = nullptr;
vk::DeviceMemory *mPrimaryMemory = nullptr;
vk::DeviceMemory *mStagingMemory = nullptr;
void setGPUResources(vk::DeviceMemory *primaryMemory,
vk::Buffer *primaryBuffer,
vk::DeviceMemory *stagingMemory,
vk::Buffer *stagingBuffer,
vk::DeviceSize offset);
void recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
vk::Buffer *bufferFrom,
vk::Buffer *bufferTo,
vk::DeviceSize bufferSize,
vk::BufferCopy copyRegion);
void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
const vk::Buffer& buffer,
vk::AccessFlagBits srcAccessMask,
vk::AccessFlagBits dstAccessMask,
vk::PipelineStageFlagBits srcStageMask,
vk::PipelineStageFlagBits dstStageMask);
// Private util functions
vk::BufferUsageFlags getPrimaryBufferUsageFlags();
vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
vk::BufferUsageFlags getStagingBufferUsageFlags();
vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
};
/**
 * Convenience subclass of Tensor bound to a single element type T.
 * NOTE(review): dataType() is expected to be specialised per supported T
 * (float, double, int32, uint32, bool) — confirm in the implementation file.
 */
template<typename T>
class TensorT : public Tensor
{
public:
// Logs destruction only; resource cleanup is performed by the Tensor base.
~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); }
/**
 * Retrieve the data type of the tensor matching the template parameter T.
 *
 * @return Data type of tensor of type kp::Tensor::TensorDataTypes
 */
TensorDataTypes dataType();
};
} // End namespace kp

View File

@ -0,0 +1,197 @@
#pragma once

// Numeric log severities; KOMPUTE_OPT_ACTIVE_LOG_LEVEL is compared against
// these to decide which KP_LOG_* macros expand to real logging calls.
#define KOMPUTE_LOG_LEVEL_TRACE 0
#define KOMPUTE_LOG_LEVEL_DEBUG 1
#define KOMPUTE_LOG_LEVEL_INFO 2
#define KOMPUTE_LOG_LEVEL_WARN 3
#define KOMPUTE_LOG_LEVEL_ERROR 4
#define KOMPUTE_LOG_LEVEL_CRITICAL 5
#define KOMPUTE_LOG_LEVEL_OFF 6

// Logging is disabled entirely.
#if KOMPUTE_OPT_LOG_LEVEL_DISABLED
#define KP_LOG_TRACE(...)
#define KP_LOG_DEBUG(...)
#define KP_LOG_INFO(...)
#define KP_LOG_WARN(...)
#define KP_LOG_ERROR(...)
#else

#if !KOMPUTE_OPT_USE_SPDLOG
#if VK_USE_PLATFORM_ANDROID_KHR
#include <android/log.h>
#include <fmt/core.h>
static const char* KOMPUTE_LOG_TAG = "KomputeLog";
#else
#if KOMPUTE_BUILD_PYTHON
#include <pybind11/pybind11.h>
namespace py = pybind11;
// from python/src/main.cpp
extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
#else
#include <fmt/core.h>
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#include <spdlog/spdlog.h>
#endif // !KOMPUTE_OPT_USE_SPDLOG
#include <set>
#include <string>
#include <vector>

namespace logger {
// Setup the logger, note the loglevel can not be set below the CMake log level
// (To change this use -DKOMPUTE_OPT_LOG_LEVEL=...)
void
setupLogger();

// Logging is enabled, but we do not use Spdlog. So we use fmt in case nothing
// else is defined, overriding logging.
#if !KOMPUTE_OPT_USE_SPDLOG

#ifndef KP_LOG_TRACE
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_TRACE
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_TRACE(...)                                                      \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_VERBOSE, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
// BUGFIX(review): this branch previously (re)defined KP_LOG_DEBUG, leaving
// KP_LOG_TRACE undefined for Python builds.
#define KP_LOG_TRACE(...) kp_trace(fmt::format(__VA_ARGS__))
#else
#define KP_LOG_TRACE(...)                                                      \
    fmt::print("[{} {}] [trace] [{}:{}] {}\n",                                 \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_TRACE(...)
#endif
#endif // !KP_LOG_TRACE

#ifndef KP_LOG_DEBUG
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_DEBUG(...)                                                      \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
#define KP_LOG_DEBUG(...) kp_debug(fmt::format(__VA_ARGS__))
#else
#ifdef __FILE_NAME__ // gcc 12 provides only file name without path
#define KP_LOG_DEBUG(...)                                                      \
    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE_NAME__,                                                  \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#else
#define KP_LOG_DEBUG(...)                                                      \
    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // __FILE_NAME__
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_DEBUG(...)
#endif
#endif // !KP_LOG_DEBUG

#ifndef KP_LOG_INFO
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_INFO(...)                                                       \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
// BUGFIX(review): this branch previously (re)defined KP_LOG_DEBUG, leaving
// KP_LOG_INFO undefined for Python builds.
#define KP_LOG_INFO(...) kp_info(fmt::format(__VA_ARGS__))
#else
#define KP_LOG_INFO(...)                                                       \
    fmt::print("[{} {}] [info] [{}:{}] {}\n",                                  \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_INFO(...)
#endif
#endif // !KP_LOG_INFO

#ifndef KP_LOG_WARN
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_WARN
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_WARN(...)                                                       \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_WARN, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
// BUGFIX(review): this branch previously (re)defined KP_LOG_DEBUG, leaving
// KP_LOG_WARN undefined for Python builds.
#define KP_LOG_WARN(...) kp_warning(fmt::format(__VA_ARGS__))
#else
#define KP_LOG_WARN(...)                                                       \
    fmt::print("[{} {}] [warn] [{}:{}] {}\n",                                  \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_WARN(...)
#endif
#endif // !KP_LOG_WARN

#ifndef KP_LOG_ERROR
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_ERROR
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_ERROR(...)                                                      \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_ERROR, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
// BUGFIX(review): this branch previously (re)defined KP_LOG_DEBUG, leaving
// KP_LOG_ERROR undefined for Python builds.
#define KP_LOG_ERROR(...) kp_error(fmt::format(__VA_ARGS__))
#else
#define KP_LOG_ERROR(...)                                                      \
    fmt::print("[{} {}] [error] [{}:{}] {}\n",                                 \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_ERROR(...)
#endif
#endif // !KP_LOG_ERROR

#else
// Spdlog is enabled: forward all KP_LOG_* macros to the SPDLOG_* macros so
// the active level is resolved at compile time via SPDLOG_ACTIVE_LEVEL.
#define KP_LOG_TRACE(...) SPDLOG_TRACE(__VA_ARGS__)
#define KP_LOG_DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__)
#define KP_LOG_INFO(...) SPDLOG_INFO(__VA_ARGS__)
#define KP_LOG_WARN(...) SPDLOG_WARN(__VA_ARGS__)
#define KP_LOG_ERROR(...) SPDLOG_ERROR(__VA_ARGS__)

void
setLogLevel(spdlog::level::level_enum level);

spdlog::level::level_enum
getLogLevel();

#endif // !KOMPUTE_OPT_USE_SPDLOG
} // namespace logger

#endif // KOMPUTE_OPT_LOG_LEVEL_DISABLED

View File

@ -0,0 +1,86 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Algorithm.hpp"
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
 * Operation that dispatches a compute Algorithm, optionally overriding its
 * push constants when the dispatch is recorded. A dynamic number of tensors
 * can be bound through the Algorithm itself; this operation only records the
 * dispatch and the surrounding barriers.
 */
class OpAlgoDispatch : public OpBase
{
  public:
    /**
     * Stores the algorithm to dispatch and keeps a privately owned byte copy
     * of the push constants that should be applied when recording.
     *
     * @param algorithm The algorithm object to use for dispatch
     * @param pushConstants The push constants to use for override
     */
    template<typename T = float>
    OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm,
                   const std::vector<T>& pushConstants = {})
    {
        KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor");

        this->mAlgorithm = algorithm;

        if (!pushConstants.empty()) {
            // Copy into raw owned storage so the caller's vector does not
            // need to outlive this operation.
            const uint32_t elementSize = sizeof(T);
            const uint32_t elementCount =
              static_cast<uint32_t>(pushConstants.size());
            const uint32_t totalBytes = elementCount * elementSize;
            this->mPushConstantsData = malloc(totalBytes);
            memcpy(this->mPushConstantsData, pushConstants.data(), totalBytes);
            this->mPushConstantsDataTypeMemorySize = elementSize;
            this->mPushConstantsSize = elementCount;
        }
    }

    /**
     * Default destructor, which is in charge of destroying the algorithm
     * components but does not destroy the underlying tensors
     */
    virtual ~OpAlgoDispatch() override;

    /**
     * This records the commands that are to be sent to the GPU: the barriers
     * that ensure the memory has been copied before going in and out of the
     * shader, the dispatch operation itself, and the GPU memory copy of the
     * output data into the staging buffer so it can be read by the host.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any postEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

  private:
    // -------------- ALWAYS OWNED RESOURCES
    std::shared_ptr<Algorithm> mAlgorithm;
    void* mPushConstantsData = nullptr;
    uint32_t mPushConstantsDataTypeMemorySize = 0;
    uint32_t mPushConstantsSize = 0;
};
} // End namespace kp

View File

@ -0,0 +1,62 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Algorithm.hpp"
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
namespace kp {
/**
 * Base Operation which provides the high level interface that Kompute
 * operations implement in order to perform a set of actions in the GPU.
 *
 * Operations can perform actions on tensors, and optionally can also own an
 * Algorithm with respective parameters. kp::Operations with kp::Algorithms
 * would inherit from kp::OpBaseAlgo.
 */
class OpBase
{
public:
/**
 * Default destructor for OpBase class. This OpBase destructor should
 * always be called to destroy and free owned resources unless it is
 * intended to destroy the resources in the parent class.
 */
virtual ~OpBase() { KP_LOG_DEBUG("Kompute OpBase destructor started"); }
/**
 * The record function is intended to only send a record command or run
 * commands that are expected to record operations that are to be submitted
 * as a batch into the GPU. Pure virtual: every operation must implement it.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void record(const vk::CommandBuffer& commandBuffer) = 0;
/**
 * Pre eval is called before the Sequence has called eval and submitted the
 * commands to the GPU for processing, and can be used to perform any
 * per-eval setup steps required as the computation iteration begins. It's
 * worth noting that there are situations where eval can be called multiple
 * times, so the resources that are created should be idempotent in case
 * it's called multiple times in a row.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0;
/**
 * Post eval is called after the Sequence has called eval and submitted the
 * commands to the GPU for processing, and can be used to perform any
 * tear-down steps required as the computation iteration finishes. It's
 * worth noting that there are situations where eval can be called multiple
 * times, so the resources that are destroyed should not require a re-init
 * unless explicitly provided by the user.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0;
};
} // End namespace kp

View File

@ -0,0 +1,50 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
 * Operation that records a raw buffer copy from a staging buffer into a
 * primary (device) buffer. Unlike the tensor sync operations it works
 * directly on vk::Buffer handles and does not own or manage their memory.
 */
class OpBufferSyncDevice : public OpBase
{
public:
/**
 * Constructor storing the buffer handles and byte size used by record().
 *
 * @param primaryBuffer Buffer the data is copied into
 * @param stagingBuffer Buffer the data is copied from
 * @param size Number of bytes to copy
 */
OpBufferSyncDevice(
vk::Buffer *primaryBuffer,
vk::Buffer *stagingBuffer,
vk::DeviceSize size);
/**
 * Default destructor. This class does not manage memory so it won't be
 * expecting the parent to perform a release.
 */
~OpBufferSyncDevice() override;
/**
 * For device buffers, it records the copy command for the buffer to copy
 * the data from its staging to device memory.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
void record(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any preEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any postEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
private:
// -------------- NEVER OWNED RESOURCES (caller-supplied handles)
vk::Buffer *mPrimaryBuffer;
vk::Buffer *mStagingBuffer;
vk::DeviceSize mSize;
};
} // End namespace kp

View File

@ -0,0 +1,50 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
 * Operation that records a raw buffer copy used to bring device data back to
 * host-visible (staging) memory. Works directly on vk::Buffer handles and
 * does not own or manage their memory.
 */
class OpBufferSyncLocal : public OpBase
{
public:
/**
 * Constructor storing the buffer handles and byte size used by record().
 *
 * @param primaryBuffer Buffer the data is copied from
 * @param stagingBuffer Buffer the data is copied into
 * @param size Number of bytes to copy
 */
OpBufferSyncLocal(
vk::Buffer *primaryBuffer,
vk::Buffer *stagingBuffer,
vk::DeviceSize size);
/**
 * Default destructor. This class does not manage memory so it won't be
 * expecting the parent to perform a release.
 */
~OpBufferSyncLocal() override;
/**
 * For device buffers, it records the copy command for the buffer to copy
 * the data from its device to staging memory.
 * NOTE(review): the original comment said "staging to device", which is the
 * OpBufferSyncDevice direction; the Local op mirrors OpTensorSyncLocal
 * (device -> staging) — confirm against the implementation.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
void record(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any preEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any postEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
private:
// -------------- NEVER OWNED RESOURCES (caller-supplied handles)
vk::Buffer *mPrimaryBuffer;
vk::Buffer *mStagingBuffer;
vk::DeviceSize mSize;
};
} // End namespace kp

View File

@ -0,0 +1,81 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Algorithm.hpp"
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
 * Operation that provides a general abstraction that simplifies the use of
 * algorithm and parameter components which can be used with shaders.
 * It exposes the pipeline barrier functionality specifically for memory
 * barriers that can be configured through the respective source and
 * destination masks.
 */
class OpMemoryBarrier : public OpBase
{
public:
/**
 * Constructor that stores tensors as well as memory barrier parameters to
 * be used to create a pipeline barrier on the respective primary or staging
 * tensor.
 *
 * @param tensors The tensors to apply the memory barriers on
 * @param srcAccessMask The vk::AccessFlagBits for the source access mask
 * @param dstAccessMask The vk::AccessFlagBits for the destination access
 * mask
 * @param srcStageMask The vk::PipelineStageFlagBits for the source stage
 * mask
 * @param dstStageMask The vk::PipelineStageFlagBits for the destination
 * stage mask
 * @param barrierOnPrimary Boolean to select primary or staging buffers on
 * tensors
 */
OpMemoryBarrier(const std::vector<std::shared_ptr<Tensor>>& tensors,
const vk::AccessFlagBits& srcAccessMask,
const vk::AccessFlagBits& dstAccessMask,
const vk::PipelineStageFlagBits& srcStageMask,
const vk::PipelineStageFlagBits& dstStageMask,
bool barrierOnPrimary = true);
/**
 * Default destructor, which is in charge of destroying the reference to the
 * tensors and all the relevant access / stage masks created
 */
virtual ~OpMemoryBarrier() override;
/**
 * This records the memory barrier with the access and stage masks provided
 * across all relevant tensors.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void record(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any preEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any postEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
private:
// Barrier parameters captured at construction; tensors are never owned.
const vk::AccessFlagBits mSrcAccessMask;
const vk::AccessFlagBits mDstAccessMask;
const vk::PipelineStageFlagBits mSrcStageMask;
const vk::PipelineStageFlagBits mDstStageMask;
const bool mBarrierOnPrimary;
const std::vector<std::shared_ptr<Tensor>> mTensors;
};
} // End namespace kp

View File

@ -0,0 +1,58 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <fstream>
#include "kompute/Core.hpp"
#include "ShaderOpMult.hpp"
#include "kompute/Algorithm.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpAlgoDispatch.hpp"
namespace kp {
/**
* Operation that performs multiplication on two tensors and outputs the
* result into a third tensor.
*/
class OpMult : public OpAlgoDispatch
{
  public:
    /**
     * Constructs the multiplication operation from exactly three tensors and
     * an algorithm, which is rebuilt with the embedded OpMult shader.
     *
     * @param tensors Tensors that are to be used in this operation
     * @param algorithm An algorithm that will be overridden with the OpMult
     * shader data and the tensors provided which are expected to be 3
     */
    OpMult(std::vector<std::shared_ptr<Tensor>> tensors,
           std::shared_ptr<Algorithm> algorithm)
      : OpAlgoDispatch(algorithm)
    {
        KP_LOG_DEBUG("Kompute OpMult constructor with params");

        // The shader is hard-wired for two inputs and one output tensor.
        if (tensors.size() != 3) {
            throw std::runtime_error(
              "Kompute OpMult expected 3 tensors but got " +
              std::to_string(tensors.size()));
        }

        const std::vector<uint32_t> shaderSpirv(SHADEROPMULT_COMP_SPV.begin(),
                                                SHADEROPMULT_COMP_SPV.end());
        algorithm->rebuild<>(tensors, shaderSpirv);
    }

    /**
     * Destructor only logs; algorithm teardown is handled by the base class
     * and the underlying tensors are not destroyed.
     */
    ~OpMult() override { KP_LOG_DEBUG("Kompute OpMult destructor started"); }
};
} // End namespace kp

View File

@ -0,0 +1,63 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
 * Operation that copies the data from the first tensor to the rest of the
 * tensors provided, using a record command for all the vectors. This operation
 * does not own/manage the memory of the tensors passed to it. The operation
 * must only receive tensors of compatible type.
 * NOTE(review): the original sentence was truncated after "tensors of type" —
 * confirm the intended constraint against the implementation.
 */
class OpTensorCopy : public OpBase
{
public:
/**
 * Default constructor with parameters that provides the core vulkan
 * resources and the tensors that will be used in the operation.
 *
 * @param tensors Tensors that will be used to create in operation.
 */
OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors);
/**
 * Default destructor. This class does not manage memory so it won't be
 * expecting the parent to perform a release.
 */
~OpTensorCopy() override;
/**
 * Records the copy commands from the first tensor into all the other
 * tensors provided. Also optionally records a barrier.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
void record(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any preEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
 * Copies the local vectors for all the tensors to sync the data with the
 * gpu.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
private:
// -------------- ALWAYS OWNED RESOURCES
std::vector<std::shared_ptr<Tensor>> mTensors;
};
} // End namespace kp

View File

@ -0,0 +1,66 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
 * Operation that syncs tensor's device by mapping local data into the device
 * memory. For TensorTypes::eDevice it will use a record operation for the
 * memory to be syncd into GPU memory which means that the operation will be
 * done in sync with GPU commands. For TensorTypes::eHost it will only map the
 * data into host memory which will happen during preEval before the recorded
 * commands are dispatched.
 */
class OpTensorSyncDevice : public OpBase
{
public:
/**
 * Default constructor with parameters that provides the core vulkan
 * resources and the tensors that will be used in the operation. The tensors
 * provided cannot be of type TensorTypes::eStorage.
 *
 * @param tensors Tensors that will be used to create in operation.
 */
OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);
/**
 * Default destructor. This class does not manage memory so it won't be
 * expecting the parent to perform a release.
 */
~OpTensorSyncDevice() override;
/**
 * For device tensors, it records the copy command for the tensor to copy
 * the data from its staging to device memory.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
void record(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any preEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any postEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
private:
// -------------- ALWAYS OWNED RESOURCES
std::vector<std::shared_ptr<Tensor>> mTensors;
// NOTE(review): raw buffer handles and size; presumably cached from the
// tensors during record — confirm in the implementation file.
vk::Buffer *mPrimaryBuffer;
vk::Buffer *mStagingBuffer;
vk::DeviceSize mSize;
};
} // End namespace kp

View File

@ -0,0 +1,66 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
 * Operation that syncs tensor's local memory by mapping device data into the
 * local CPU memory. For TensorTypes::eDevice it will use a record operation
 * for the memory to be syncd into GPU memory which means that the operation
 * will be done in sync with GPU commands. For TensorTypes::eHost it will
 * only map the data into host memory which will happen during preEval before
 * the recorded commands are dispatched.
 */
class OpTensorSyncLocal : public OpBase
{
public:
/**
 * Default constructor with parameters that provides the core vulkan
 * resources and the tensors that will be used in the operation. The tensors
 * provided cannot be of type TensorTypes::eStorage.
 *
 * @param tensors Tensors that will be used to create in operation.
 */
OpTensorSyncLocal(const std::vector<std::shared_ptr<Tensor>>& tensors);
/**
 * Default destructor. This class does not manage memory so it won't be
 * expecting the parent to perform a release.
 */
~OpTensorSyncLocal() override;
/**
 * For device tensors, it records the copy command for the tensor to copy
 * the data from its device to staging memory.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
void record(const vk::CommandBuffer& commandBuffer) override;
/**
 * Does not perform any preEval commands.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
 * For host tensors it performs the map command from the host memory into
 * local memory.
 *
 * @param commandBuffer The command buffer to record the command into.
 */
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
private:
// -------------- ALWAYS OWNED RESOURCES
std::vector<std::shared_ptr<Tensor>> mTensors;
};
} // End namespace kp

View File

@ -0,0 +1,69 @@
cmake_minimum_required(VERSION 3.20)

set(LOGGER_SOURCES Logger.cpp)
add_library(kp_logger ${LOGGER_SOURCES})

# Define log levels in code. These are directory-scoped on purpose so that
# every target in this tree sees the same numeric level values.
add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0)
add_compile_definitions(KOMPUTE_LOG_LEVEL_DEBUG=1)
add_compile_definitions(KOMPUTE_LOG_LEVEL_INFO=2)
add_compile_definitions(KOMPUTE_LOG_LEVEL_WARN=3)
add_compile_definitions(KOMPUTE_LOG_LEVEL_ERROR=4)
add_compile_definitions(KOMPUTE_LOG_LEVEL_CRITICAL=5)
add_compile_definitions(KOMPUTE_LOG_LEVEL_OFF=6)

if(KOMPUTE_OPT_BUILD_PYTHON AND KOMPUTE_OPT_USE_SPDLOG)
    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_PYTHON' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
endif()

if(KOMPUTE_OPT_ANDROID_BUILD AND KOMPUTE_OPT_USE_SPDLOG)
    message(FATAL_ERROR "'KOMPUTE_OPT_ANDROID_BUILD' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
endif()

# Map the user-facing log level string onto the suffix used by the
# KOMPUTE_LOG_LEVEL_* macros.
# BUGFIX(review): comparisons are quoted; the previous unquoted
# `if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL ...)` form is a parse error when the
# variable is unset/empty and double-dereferences when its value happens to
# name another variable.
if("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Trace")
    set(KOMPUTE_OPT_LOG_LEVEL TRACE)
    message(STATUS "Using log level Trace")
elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Debug")
    set(KOMPUTE_OPT_LOG_LEVEL DEBUG)
    message(STATUS "Using log level Debug")
elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Info")
    set(KOMPUTE_OPT_LOG_LEVEL INFO)
    message(STATUS "Using log level Info")
elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Warn")
    set(KOMPUTE_OPT_LOG_LEVEL WARN)
    message(STATUS "Using log level Warn")
elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Error")
    set(KOMPUTE_OPT_LOG_LEVEL ERROR)
    message(STATUS "Using log level Error")
elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Critical")
    set(KOMPUTE_OPT_LOG_LEVEL CRITICAL)
    message(STATUS "Using log level Critical")
elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Off")
    set(KOMPUTE_OPT_LOG_LEVEL OFF)
    message(STATUS "Using log level Off")
elseif("${KOMPUTE_OPT_LOG_LEVEL}" STREQUAL "Default")
    set(KOMPUTE_OPT_LOG_LEVEL $<IF:$<CONFIG:Debug>,DEBUG,INFO>)
    message(STATUS "Setting KOMPUTE_OPT_LOG_LEVEL to according to the build type")
else()
    message(FATAL_ERROR "Log level '${KOMPUTE_OPT_LOG_LEVEL}' unknown, use -DKOMPUTE_OPT_LOG_LEVEL={Trace, Debug, Info, Warn, Error, Critical, Off, Default} to set it to a correct value.")
endif()

# Always make sure we define the Kompute log level independent of the Spdlog log level
target_compile_definitions(kp_logger INTERFACE KOMPUTE_OPT_ACTIVE_LOG_LEVEL=KOMPUTE_LOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})

# Link depending on how the logger should be setup
if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
    if(KOMPUTE_OPT_USE_SPDLOG)
        target_link_libraries(kp_logger PUBLIC spdlog::spdlog)
        target_compile_definitions(spdlog INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
        target_compile_definitions(kp_logger INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
        message(STATUS "setting SPDLOG_ACTIVE_LEVEL to SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}")

        if(KOMPUTE_OPT_SPDLOG_ASYNC_MODE)
            target_compile_definitions(kp_logger INTERFACE KOMPUTE_SPDLOG_ASYNC_LOGGING=1)
        endif()
    else()
        target_link_libraries(kp_logger PUBLIC fmt::fmt)
    endif()
endif()

View File

@ -0,0 +1,101 @@
#include "kompute/logger/Logger.hpp"

#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
#if !KOMPUTE_OPT_USE_SPDLOG
#else
#include <cassert>
#include <iostream>
#include <memory>
#include <mutex>
#include <spdlog/async.h>
#include <spdlog/common.h>
#include <spdlog/logger.h>
#include <spdlog/sinks/stdout_color_sinks.h>
#include <spdlog/spdlog.h>
#include <string>
#endif // !KOMPUTE_OPT_USE_SPDLOG

namespace logger {
#if !KOMPUTE_OPT_USE_SPDLOG

// Without spdlog the KP_LOG_* macros handle output themselves, so there is
// nothing to initialise here.
void
setupLogger()
{
}

#else
constexpr int THREAD_QUEUE_LENGTH = 8192;

void
setupLogger()
{
    // Ensure we setup the logger only once, and that concurrent callers block
    // until it is ready.
    // BUGFIX(review): the previous hand-rolled flag+mutex released the lock
    // before the setup ran, so a second thread could return (and log) before
    // the default logger was installed.
    static std::once_flag setupOnce;
    std::call_once(setupOnce, [] {
        spdlog::init_thread_pool(THREAD_QUEUE_LENGTH, 1);
        spdlog::sink_ptr console_sink =
          std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
        // Wider source-location column when debug/trace is compiled in.
#if SPDLOG_ACTIVE_LEVEL < SPDLOG_LEVEL_INFO
        console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=21s] %v");
#else
        console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=15s] %v");
#endif
        std::vector<spdlog::sink_ptr> sinks{ console_sink };
        // TODO: Add flag in compile flags
        std::shared_ptr<spdlog::logger> logger =
#if KOMPUTE_SPDLOG_ASYNC_LOGGING
          std::make_shared<spdlog::async_logger>(
            "",
            sinks.begin(),
            sinks.end(),
            spdlog::thread_pool(),
            spdlog::async_overflow_policy::block);
#else
          std::make_shared<spdlog::logger>(
            "",
            sinks.begin(),
            sinks.end());
#endif
        logger->set_level(getLogLevel());
        spdlog::set_default_logger(logger);
    });
}

// Translate the compile-time SPDLOG_ACTIVE_LEVEL into the runtime enum.
spdlog::level::level_enum
getLogLevel()
{
#if SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_TRACE
    return spdlog::level::trace;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_DEBUG
    return spdlog::level::debug;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_INFO
    return spdlog::level::info;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_WARN
    return spdlog::level::warn;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_ERROR
    return spdlog::level::error;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_CRITICAL
    return spdlog::level::critical;
#else
    return spdlog::level::off;
#endif
}

// Adjust the default logger's runtime level (cannot go below the compile-time
// SPDLOG_ACTIVE_LEVEL, which strips calls at compile time).
void
setLogLevel(const spdlog::level::level_enum level)
{
    spdlog::default_logger()->set_level(level);
}
#endif // !KOMPUTE_OPT_USE_SPDLOG
} // namespace logger
#endif

View File

@ -0,0 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# ######################
cmake_minimum_required(VERSION 3.20)
# All shader sources and their generated headers live in glsl/.
add_subdirectory(glsl)

View File

@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# ######################
cmake_minimum_required(VERSION 3.20)
# Check if build shaders from source is enabled
if(KOMPUTE_OPT_BUILD_SHADERS)
    vulkan_compile_shader(INFILE ShaderOpMult.comp
                          OUTFILE ShaderOpMult.hpp
                          NAMESPACE "kp")
    vulkan_compile_shader(INFILE ShaderLogisticRegression.comp
                          OUTFILE ShaderLogisticRegression.hpp
                          NAMESPACE "kp")
else() # Else we will use our precompiled versions
    # DEPENDS on the checked-in .hpp.in inputs so that editing a precompiled
    # header triggers a fresh copy (without it the command never re-runs once
    # the output exists). VERBATIM gives portable argument escaping. Plain
    # ${CMAKE_CURRENT_BINARY_DIR} paths match what add_library() below uses.
    add_custom_command(
        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
                ${CMAKE_CURRENT_SOURCE_DIR}/ShaderOpMult.hpp.in
                ${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/ShaderOpMult.hpp.in
        VERBATIM)
    add_custom_command(
        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
                ${CMAKE_CURRENT_SOURCE_DIR}/ShaderLogisticRegression.hpp.in
                ${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/ShaderLogisticRegression.hpp.in
        VERBATIM)
endif()
# Interface library carrying the generated shader headers for consumers.
add_library(kp_shader INTERFACE "${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp"
                                "${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp")
target_include_directories(kp_shader INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
# Make sure we install shaders. Use plain paths here: $<BUILD_INTERFACE:...>
# expands to an empty string outside build-interface contexts, so wrapping the
# install sources in it would install nothing.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

View File

@ -0,0 +1,52 @@
#version 450

// Logistic-regression training step: one invocation per sample computes the
// per-sample gradient contributions and the cross-entropy loss.

// Specialization constant; it divides the gradient terms below, so it is
// presumably the sample count used for averaging — confirm with the
// host-side pipeline setup.
layout (constant_id = 0) const float m = 0;
layout (local_size_x = 1) in;
// Inputs: per-sample features (xi, xj), labels (y), current weights (win)
// and bias (bin). Outputs: per-sample weight gradients (wouti, woutj),
// bias gradient (bout) and loss (lout).
layout(set = 0, binding = 0) buffer bxi { float xi[]; };
layout(set = 0, binding = 1) buffer bxj { float xj[]; };
layout(set = 0, binding = 2) buffer by { float y[]; };
layout(set = 0, binding = 3) buffer bwin { float win[]; };
layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
layout(set = 0, binding = 6) buffer bbin { float bin[]; };
layout(set = 0, binding = 7) buffer bbout { float bout[]; };
layout(set = 0, binding = 8) buffer blout { float lout[]; };

// Standard logistic sigmoid: 1 / (1 + e^-z).
float sigmoid(float z) {
return 1.0 / (1.0 + exp(-z));
}

// Forward pass: linear map w.x + b squashed through the sigmoid.
float inference(vec2 x, vec2 w, float b) {
// Compute the linear mapping function
float z = dot(w, x) + b;
// Calculate the y-hat with sigmoid
float yHat = sigmoid(z);
return yHat;
}

// Binary cross-entropy for a single prediction.
// NOTE(review): log(yHat) is -inf when yHat saturates to 0 or 1 — there is
// no epsilon clamping here; confirm inputs keep yHat strictly inside (0, 1).
float calculateLoss(float yHat, float y) {
return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat));
}

void main() {
// One sample per invocation; no bounds check, so the dispatch size must
// not exceed the buffer lengths.
uint idx = gl_GlobalInvocationID.x;
// Current model parameters are broadcast from element 0 of win/bin.
vec2 wCurr = vec2(win[0], win[1]);
float bCurr = bin[0];
vec2 xCurr = vec2(xi[idx], xj[idx]);
float yCurr = y[idx];
float yHat = inference(xCurr, wCurr, bCurr);
// d(loss)/dz of the cross-entropy + sigmoid pair is (yHat - y).
float dZ = yHat - yCurr;
vec2 dW = (1. / m) * xCurr * dZ;
float dB = (1. / m) * dZ;
wouti[idx] = dW.x;
woutj[idx] = dW.y;
bout[idx] = dB;
lout[idx] = calculateLoss(yHat, yCurr);
}

View File

@ -0,0 +1,310 @@
#pragma once
#include <array>
#include <cstdint>
namespace kp {
const std::array<uint32_t, 1204> SHADERLOGISTICREGRESSION_COMP_SPV = {
0x07230203, 0x00010000, 0x0008000a, 0x000000ae,
0x00000000, 0x00020011, 0x00000001, 0x0006000b,
0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e,
0x00000000, 0x0003000e, 0x00000000, 0x00000001,
0x0006000f, 0x00000005, 0x00000004, 0x6e69616d,
0x00000000, 0x00000041, 0x00060010, 0x00000004,
0x00000011, 0x00000001, 0x00000001, 0x00000001,
0x00030003, 0x00000002, 0x000001c2, 0x00040005,
0x00000004, 0x6e69616d, 0x00000000, 0x00050005,
0x0000000a, 0x6d676973, 0x2864696f, 0x003b3166,
0x00030005, 0x00000009, 0x0000007a, 0x00080005,
0x00000012, 0x65666e69, 0x636e6572, 0x66762865,
0x66763b32, 0x31663b32, 0x0000003b, 0x00030005,
0x0000000f, 0x00000078, 0x00030005, 0x00000010,
0x00000077, 0x00030005, 0x00000011, 0x00000062,
0x00080005, 0x00000017, 0x636c6163, 0x74616c75,
0x736f4c65, 0x31662873, 0x3b31663b, 0x00000000,
0x00040005, 0x00000015, 0x74614879, 0x00000000,
0x00030005, 0x00000016, 0x00000079, 0x00030005,
0x00000021, 0x0000007a, 0x00040005, 0x00000027,
0x74614879, 0x00000000, 0x00040005, 0x00000028,
0x61726170, 0x0000006d, 0x00030005, 0x0000003e,
0x00786469, 0x00080005, 0x00000041, 0x475f6c67,
0x61626f6c, 0x766e496c, 0x7461636f, 0x496e6f69,
0x00000044, 0x00040005, 0x00000046, 0x72754377,
0x00000072, 0x00040005, 0x00000048, 0x6e697762,
0x00000000, 0x00040006, 0x00000048, 0x00000000,
0x006e6977, 0x00030005, 0x0000004a, 0x00000000,
0x00040005, 0x00000054, 0x72754362, 0x00000072,
0x00040005, 0x00000056, 0x6e696262, 0x00000000,
0x00040006, 0x00000056, 0x00000000, 0x006e6962,
0x00030005, 0x00000058, 0x00000000, 0x00040005,
0x0000005b, 0x72754378, 0x00000072, 0x00030005,
0x0000005d, 0x00697862, 0x00040006, 0x0000005d,
0x00000000, 0x00006978, 0x00030005, 0x0000005f,
0x00000000, 0x00030005, 0x00000064, 0x006a7862,
0x00040006, 0x00000064, 0x00000000, 0x00006a78,
0x00030005, 0x00000066, 0x00000000, 0x00040005,
0x0000006b, 0x72754379, 0x00000072, 0x00030005,
0x0000006d, 0x00007962, 0x00040006, 0x0000006d,
0x00000000, 0x00000079, 0x00030005, 0x0000006f,
0x00000000, 0x00040005, 0x00000073, 0x74614879,
0x00000000, 0x00040005, 0x00000074, 0x61726170,
0x0000006d, 0x00040005, 0x00000076, 0x61726170,
0x0000006d, 0x00040005, 0x00000078, 0x61726170,
0x0000006d, 0x00030005, 0x0000007b, 0x00005a64,
0x00030005, 0x0000007f, 0x00005764, 0x00030005,
0x00000080, 0x0000006d, 0x00030005, 0x00000086,
0x00004264, 0x00040005, 0x0000008b, 0x756f7762,
0x00006974, 0x00050006, 0x0000008b, 0x00000000,
0x74756f77, 0x00000069, 0x00030005, 0x0000008d,
0x00000000, 0x00040005, 0x00000093, 0x756f7762,
0x00006a74, 0x00050006, 0x00000093, 0x00000000,
0x74756f77, 0x0000006a, 0x00030005, 0x00000095,
0x00000000, 0x00040005, 0x0000009c, 0x756f6262,
0x00000074, 0x00050006, 0x0000009c, 0x00000000,
0x74756f62, 0x00000000, 0x00030005, 0x0000009e,
0x00000000, 0x00040005, 0x000000a3, 0x756f6c62,
0x00000074, 0x00050006, 0x000000a3, 0x00000000,
0x74756f6c, 0x00000000, 0x00030005, 0x000000a5,
0x00000000, 0x00040005, 0x000000a7, 0x61726170,
0x0000006d, 0x00040005, 0x000000a9, 0x61726170,
0x0000006d, 0x00040047, 0x00000041, 0x0000000b,
0x0000001c, 0x00040047, 0x00000047, 0x00000006,
0x00000004, 0x00050048, 0x00000048, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x00000048,
0x00000003, 0x00040047, 0x0000004a, 0x00000022,
0x00000000, 0x00040047, 0x0000004a, 0x00000021,
0x00000003, 0x00040047, 0x00000055, 0x00000006,
0x00000004, 0x00050048, 0x00000056, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x00000056,
0x00000003, 0x00040047, 0x00000058, 0x00000022,
0x00000000, 0x00040047, 0x00000058, 0x00000021,
0x00000006, 0x00040047, 0x0000005c, 0x00000006,
0x00000004, 0x00050048, 0x0000005d, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x0000005d,
0x00000003, 0x00040047, 0x0000005f, 0x00000022,
0x00000000, 0x00040047, 0x0000005f, 0x00000021,
0x00000000, 0x00040047, 0x00000063, 0x00000006,
0x00000004, 0x00050048, 0x00000064, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x00000064,
0x00000003, 0x00040047, 0x00000066, 0x00000022,
0x00000000, 0x00040047, 0x00000066, 0x00000021,
0x00000001, 0x00040047, 0x0000006c, 0x00000006,
0x00000004, 0x00050048, 0x0000006d, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x0000006d,
0x00000003, 0x00040047, 0x0000006f, 0x00000022,
0x00000000, 0x00040047, 0x0000006f, 0x00000021,
0x00000002, 0x00040047, 0x00000080, 0x00000001,
0x00000000, 0x00040047, 0x0000008a, 0x00000006,
0x00000004, 0x00050048, 0x0000008b, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x0000008b,
0x00000003, 0x00040047, 0x0000008d, 0x00000022,
0x00000000, 0x00040047, 0x0000008d, 0x00000021,
0x00000004, 0x00040047, 0x00000092, 0x00000006,
0x00000004, 0x00050048, 0x00000093, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x00000093,
0x00000003, 0x00040047, 0x00000095, 0x00000022,
0x00000000, 0x00040047, 0x00000095, 0x00000021,
0x00000005, 0x00040047, 0x0000009b, 0x00000006,
0x00000004, 0x00050048, 0x0000009c, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x0000009c,
0x00000003, 0x00040047, 0x0000009e, 0x00000022,
0x00000000, 0x00040047, 0x0000009e, 0x00000021,
0x00000007, 0x00040047, 0x000000a2, 0x00000006,
0x00000004, 0x00050048, 0x000000a3, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x000000a3,
0x00000003, 0x00040047, 0x000000a5, 0x00000022,
0x00000000, 0x00040047, 0x000000a5, 0x00000021,
0x00000008, 0x00040047, 0x000000ad, 0x0000000b,
0x00000019, 0x00020013, 0x00000002, 0x00030021,
0x00000003, 0x00000002, 0x00030016, 0x00000006,
0x00000020, 0x00040020, 0x00000007, 0x00000007,
0x00000006, 0x00040021, 0x00000008, 0x00000006,
0x00000007, 0x00040017, 0x0000000c, 0x00000006,
0x00000002, 0x00040020, 0x0000000d, 0x00000007,
0x0000000c, 0x00060021, 0x0000000e, 0x00000006,
0x0000000d, 0x0000000d, 0x00000007, 0x00050021,
0x00000014, 0x00000006, 0x00000007, 0x00000007,
0x0004002b, 0x00000006, 0x00000019, 0x3f800000,
0x00040015, 0x0000003c, 0x00000020, 0x00000000,
0x00040020, 0x0000003d, 0x00000007, 0x0000003c,
0x00040017, 0x0000003f, 0x0000003c, 0x00000003,
0x00040020, 0x00000040, 0x00000001, 0x0000003f,
0x0004003b, 0x00000040, 0x00000041, 0x00000001,
0x0004002b, 0x0000003c, 0x00000042, 0x00000000,
0x00040020, 0x00000043, 0x00000001, 0x0000003c,
0x0003001d, 0x00000047, 0x00000006, 0x0003001e,
0x00000048, 0x00000047, 0x00040020, 0x00000049,
0x00000002, 0x00000048, 0x0004003b, 0x00000049,
0x0000004a, 0x00000002, 0x00040015, 0x0000004b,
0x00000020, 0x00000001, 0x0004002b, 0x0000004b,
0x0000004c, 0x00000000, 0x00040020, 0x0000004d,
0x00000002, 0x00000006, 0x0004002b, 0x0000004b,
0x00000050, 0x00000001, 0x0003001d, 0x00000055,
0x00000006, 0x0003001e, 0x00000056, 0x00000055,
0x00040020, 0x00000057, 0x00000002, 0x00000056,
0x0004003b, 0x00000057, 0x00000058, 0x00000002,
0x0003001d, 0x0000005c, 0x00000006, 0x0003001e,
0x0000005d, 0x0000005c, 0x00040020, 0x0000005e,
0x00000002, 0x0000005d, 0x0004003b, 0x0000005e,
0x0000005f, 0x00000002, 0x0003001d, 0x00000063,
0x00000006, 0x0003001e, 0x00000064, 0x00000063,
0x00040020, 0x00000065, 0x00000002, 0x00000064,
0x0004003b, 0x00000065, 0x00000066, 0x00000002,
0x0003001d, 0x0000006c, 0x00000006, 0x0003001e,
0x0000006d, 0x0000006c, 0x00040020, 0x0000006e,
0x00000002, 0x0000006d, 0x0004003b, 0x0000006e,
0x0000006f, 0x00000002, 0x00040032, 0x00000006,
0x00000080, 0x00000000, 0x0003001d, 0x0000008a,
0x00000006, 0x0003001e, 0x0000008b, 0x0000008a,
0x00040020, 0x0000008c, 0x00000002, 0x0000008b,
0x0004003b, 0x0000008c, 0x0000008d, 0x00000002,
0x0003001d, 0x00000092, 0x00000006, 0x0003001e,
0x00000093, 0x00000092, 0x00040020, 0x00000094,
0x00000002, 0x00000093, 0x0004003b, 0x00000094,
0x00000095, 0x00000002, 0x0004002b, 0x0000003c,
0x00000097, 0x00000001, 0x0003001d, 0x0000009b,
0x00000006, 0x0003001e, 0x0000009c, 0x0000009b,
0x00040020, 0x0000009d, 0x00000002, 0x0000009c,
0x0004003b, 0x0000009d, 0x0000009e, 0x00000002,
0x0003001d, 0x000000a2, 0x00000006, 0x0003001e,
0x000000a3, 0x000000a2, 0x00040020, 0x000000a4,
0x00000002, 0x000000a3, 0x0004003b, 0x000000a4,
0x000000a5, 0x00000002, 0x0006002c, 0x0000003f,
0x000000ad, 0x00000097, 0x00000097, 0x00000097,
0x00050036, 0x00000002, 0x00000004, 0x00000000,
0x00000003, 0x000200f8, 0x00000005, 0x0004003b,
0x0000003d, 0x0000003e, 0x00000007, 0x0004003b,
0x0000000d, 0x00000046, 0x00000007, 0x0004003b,
0x00000007, 0x00000054, 0x00000007, 0x0004003b,
0x0000000d, 0x0000005b, 0x00000007, 0x0004003b,
0x00000007, 0x0000006b, 0x00000007, 0x0004003b,
0x00000007, 0x00000073, 0x00000007, 0x0004003b,
0x0000000d, 0x00000074, 0x00000007, 0x0004003b,
0x0000000d, 0x00000076, 0x00000007, 0x0004003b,
0x00000007, 0x00000078, 0x00000007, 0x0004003b,
0x00000007, 0x0000007b, 0x00000007, 0x0004003b,
0x0000000d, 0x0000007f, 0x00000007, 0x0004003b,
0x00000007, 0x00000086, 0x00000007, 0x0004003b,
0x00000007, 0x000000a7, 0x00000007, 0x0004003b,
0x00000007, 0x000000a9, 0x00000007, 0x00050041,
0x00000043, 0x00000044, 0x00000041, 0x00000042,
0x0004003d, 0x0000003c, 0x00000045, 0x00000044,
0x0003003e, 0x0000003e, 0x00000045, 0x00060041,
0x0000004d, 0x0000004e, 0x0000004a, 0x0000004c,
0x0000004c, 0x0004003d, 0x00000006, 0x0000004f,
0x0000004e, 0x00060041, 0x0000004d, 0x00000051,
0x0000004a, 0x0000004c, 0x00000050, 0x0004003d,
0x00000006, 0x00000052, 0x00000051, 0x00050050,
0x0000000c, 0x00000053, 0x0000004f, 0x00000052,
0x0003003e, 0x00000046, 0x00000053, 0x00060041,
0x0000004d, 0x00000059, 0x00000058, 0x0000004c,
0x0000004c, 0x0004003d, 0x00000006, 0x0000005a,
0x00000059, 0x0003003e, 0x00000054, 0x0000005a,
0x0004003d, 0x0000003c, 0x00000060, 0x0000003e,
0x00060041, 0x0000004d, 0x00000061, 0x0000005f,
0x0000004c, 0x00000060, 0x0004003d, 0x00000006,
0x00000062, 0x00000061, 0x0004003d, 0x0000003c,
0x00000067, 0x0000003e, 0x00060041, 0x0000004d,
0x00000068, 0x00000066, 0x0000004c, 0x00000067,
0x0004003d, 0x00000006, 0x00000069, 0x00000068,
0x00050050, 0x0000000c, 0x0000006a, 0x00000062,
0x00000069, 0x0003003e, 0x0000005b, 0x0000006a,
0x0004003d, 0x0000003c, 0x00000070, 0x0000003e,
0x00060041, 0x0000004d, 0x00000071, 0x0000006f,
0x0000004c, 0x00000070, 0x0004003d, 0x00000006,
0x00000072, 0x00000071, 0x0003003e, 0x0000006b,
0x00000072, 0x0004003d, 0x0000000c, 0x00000075,
0x0000005b, 0x0003003e, 0x00000074, 0x00000075,
0x0004003d, 0x0000000c, 0x00000077, 0x00000046,
0x0003003e, 0x00000076, 0x00000077, 0x0004003d,
0x00000006, 0x00000079, 0x00000054, 0x0003003e,
0x00000078, 0x00000079, 0x00070039, 0x00000006,
0x0000007a, 0x00000012, 0x00000074, 0x00000076,
0x00000078, 0x0003003e, 0x00000073, 0x0000007a,
0x0004003d, 0x00000006, 0x0000007c, 0x00000073,
0x0004003d, 0x00000006, 0x0000007d, 0x0000006b,
0x00050083, 0x00000006, 0x0000007e, 0x0000007c,
0x0000007d, 0x0003003e, 0x0000007b, 0x0000007e,
0x00050088, 0x00000006, 0x00000081, 0x00000019,
0x00000080, 0x0004003d, 0x0000000c, 0x00000082,
0x0000005b, 0x0005008e, 0x0000000c, 0x00000083,
0x00000082, 0x00000081, 0x0004003d, 0x00000006,
0x00000084, 0x0000007b, 0x0005008e, 0x0000000c,
0x00000085, 0x00000083, 0x00000084, 0x0003003e,
0x0000007f, 0x00000085, 0x00050088, 0x00000006,
0x00000087, 0x00000019, 0x00000080, 0x0004003d,
0x00000006, 0x00000088, 0x0000007b, 0x00050085,
0x00000006, 0x00000089, 0x00000087, 0x00000088,
0x0003003e, 0x00000086, 0x00000089, 0x0004003d,
0x0000003c, 0x0000008e, 0x0000003e, 0x00050041,
0x00000007, 0x0000008f, 0x0000007f, 0x00000042,
0x0004003d, 0x00000006, 0x00000090, 0x0000008f,
0x00060041, 0x0000004d, 0x00000091, 0x0000008d,
0x0000004c, 0x0000008e, 0x0003003e, 0x00000091,
0x00000090, 0x0004003d, 0x0000003c, 0x00000096,
0x0000003e, 0x00050041, 0x00000007, 0x00000098,
0x0000007f, 0x00000097, 0x0004003d, 0x00000006,
0x00000099, 0x00000098, 0x00060041, 0x0000004d,
0x0000009a, 0x00000095, 0x0000004c, 0x00000096,
0x0003003e, 0x0000009a, 0x00000099, 0x0004003d,
0x0000003c, 0x0000009f, 0x0000003e, 0x0004003d,
0x00000006, 0x000000a0, 0x00000086, 0x00060041,
0x0000004d, 0x000000a1, 0x0000009e, 0x0000004c,
0x0000009f, 0x0003003e, 0x000000a1, 0x000000a0,
0x0004003d, 0x0000003c, 0x000000a6, 0x0000003e,
0x0004003d, 0x00000006, 0x000000a8, 0x00000073,
0x0003003e, 0x000000a7, 0x000000a8, 0x0004003d,
0x00000006, 0x000000aa, 0x0000006b, 0x0003003e,
0x000000a9, 0x000000aa, 0x00060039, 0x00000006,
0x000000ab, 0x00000017, 0x000000a7, 0x000000a9,
0x00060041, 0x0000004d, 0x000000ac, 0x000000a5,
0x0000004c, 0x000000a6, 0x0003003e, 0x000000ac,
0x000000ab, 0x000100fd, 0x00010038, 0x00050036,
0x00000006, 0x0000000a, 0x00000000, 0x00000008,
0x00030037, 0x00000007, 0x00000009, 0x000200f8,
0x0000000b, 0x0004003d, 0x00000006, 0x0000001a,
0x00000009, 0x0004007f, 0x00000006, 0x0000001b,
0x0000001a, 0x0006000c, 0x00000006, 0x0000001c,
0x00000001, 0x0000001b, 0x0000001b, 0x00050081,
0x00000006, 0x0000001d, 0x00000019, 0x0000001c,
0x00050088, 0x00000006, 0x0000001e, 0x00000019,
0x0000001d, 0x000200fe, 0x0000001e, 0x00010038,
0x00050036, 0x00000006, 0x00000012, 0x00000000,
0x0000000e, 0x00030037, 0x0000000d, 0x0000000f,
0x00030037, 0x0000000d, 0x00000010, 0x00030037,
0x00000007, 0x00000011, 0x000200f8, 0x00000013,
0x0004003b, 0x00000007, 0x00000021, 0x00000007,
0x0004003b, 0x00000007, 0x00000027, 0x00000007,
0x0004003b, 0x00000007, 0x00000028, 0x00000007,
0x0004003d, 0x0000000c, 0x00000022, 0x00000010,
0x0004003d, 0x0000000c, 0x00000023, 0x0000000f,
0x00050094, 0x00000006, 0x00000024, 0x00000022,
0x00000023, 0x0004003d, 0x00000006, 0x00000025,
0x00000011, 0x00050081, 0x00000006, 0x00000026,
0x00000024, 0x00000025, 0x0003003e, 0x00000021,
0x00000026, 0x0004003d, 0x00000006, 0x00000029,
0x00000021, 0x0003003e, 0x00000028, 0x00000029,
0x00050039, 0x00000006, 0x0000002a, 0x0000000a,
0x00000028, 0x0003003e, 0x00000027, 0x0000002a,
0x0004003d, 0x00000006, 0x0000002b, 0x00000027,
0x000200fe, 0x0000002b, 0x00010038, 0x00050036,
0x00000006, 0x00000017, 0x00000000, 0x00000014,
0x00030037, 0x00000007, 0x00000015, 0x00030037,
0x00000007, 0x00000016, 0x000200f8, 0x00000018,
0x0004003d, 0x00000006, 0x0000002e, 0x00000016,
0x0004003d, 0x00000006, 0x0000002f, 0x00000015,
0x0006000c, 0x00000006, 0x00000030, 0x00000001,
0x0000001c, 0x0000002f, 0x00050085, 0x00000006,
0x00000031, 0x0000002e, 0x00000030, 0x0004003d,
0x00000006, 0x00000032, 0x00000016, 0x00050083,
0x00000006, 0x00000033, 0x00000019, 0x00000032,
0x0004003d, 0x00000006, 0x00000034, 0x00000015,
0x00050083, 0x00000006, 0x00000035, 0x00000019,
0x00000034, 0x0006000c, 0x00000006, 0x00000036,
0x00000001, 0x0000001c, 0x00000035, 0x00050085,
0x00000006, 0x00000037, 0x00000033, 0x00000036,
0x00050081, 0x00000006, 0x00000038, 0x00000031,
0x00000037, 0x0004007f, 0x00000006, 0x00000039,
0x00000038, 0x000200fe, 0x00000039, 0x00010038 };
} // namespace kp

View File

@ -0,0 +1,28 @@
#version 450

// Element-wise multiply: out[i] = lhs[i] * rhs[i], one element per invocation.
layout(set = 0, binding = 0) buffer tensorLhs {
float valuesLhs[ ];
};
layout(set = 0, binding = 1) buffer tensorRhs {
float valuesRhs[ ];
};
layout(set = 0, binding = 2) buffer tensorOutput {
float valuesOutput[ ];
};

// Tensor lengths supplied as specialization constants.
// NOTE(review): none of these is referenced below and there is no bounds
// check on the index, so the dispatch size must not exceed the buffer sizes.
layout (constant_id = 0) const uint LEN_LHS = 0;
layout (constant_id = 1) const uint LEN_RHS = 0;
layout (constant_id = 2) const uint LEN_OUT = 0;

layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

void main()
{
// One output element per global invocation.
uint index = gl_GlobalInvocationID.x;
valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
}

View File

@ -0,0 +1,101 @@
#pragma once
#include <array>
#include <cstdint>
namespace kp {
const std::array<uint32_t, 366> SHADEROPMULT_COMP_SPV = {
0x07230203, 0x00010000, 0x0008000a, 0x0000002e,
0x00000000, 0x00020011, 0x00000001, 0x0006000b,
0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e,
0x00000000, 0x0003000e, 0x00000000, 0x00000001,
0x0006000f, 0x00000005, 0x00000004, 0x6e69616d,
0x00000000, 0x0000000b, 0x00060010, 0x00000004,
0x00000011, 0x00000001, 0x00000001, 0x00000001,
0x00030003, 0x00000002, 0x000001c2, 0x00040005,
0x00000004, 0x6e69616d, 0x00000000, 0x00040005,
0x00000008, 0x65646e69, 0x00000078, 0x00080005,
0x0000000b, 0x475f6c67, 0x61626f6c, 0x766e496c,
0x7461636f, 0x496e6f69, 0x00000044, 0x00060005,
0x00000012, 0x736e6574, 0x754f726f, 0x74757074,
0x00000000, 0x00070006, 0x00000012, 0x00000000,
0x756c6176, 0x754f7365, 0x74757074, 0x00000000,
0x00030005, 0x00000014, 0x00000000, 0x00050005,
0x00000019, 0x736e6574, 0x684c726f, 0x00000073,
0x00060006, 0x00000019, 0x00000000, 0x756c6176,
0x684c7365, 0x00000073, 0x00030005, 0x0000001b,
0x00000000, 0x00050005, 0x00000021, 0x736e6574,
0x6852726f, 0x00000073, 0x00060006, 0x00000021,
0x00000000, 0x756c6176, 0x68527365, 0x00000073,
0x00030005, 0x00000023, 0x00000000, 0x00040005,
0x00000029, 0x5f4e454c, 0x0053484c, 0x00040005,
0x0000002a, 0x5f4e454c, 0x00534852, 0x00040005,
0x0000002b, 0x5f4e454c, 0x0054554f, 0x00040047,
0x0000000b, 0x0000000b, 0x0000001c, 0x00040047,
0x00000011, 0x00000006, 0x00000004, 0x00050048,
0x00000012, 0x00000000, 0x00000023, 0x00000000,
0x00030047, 0x00000012, 0x00000003, 0x00040047,
0x00000014, 0x00000022, 0x00000000, 0x00040047,
0x00000014, 0x00000021, 0x00000002, 0x00040047,
0x00000018, 0x00000006, 0x00000004, 0x00050048,
0x00000019, 0x00000000, 0x00000023, 0x00000000,
0x00030047, 0x00000019, 0x00000003, 0x00040047,
0x0000001b, 0x00000022, 0x00000000, 0x00040047,
0x0000001b, 0x00000021, 0x00000000, 0x00040047,
0x00000020, 0x00000006, 0x00000004, 0x00050048,
0x00000021, 0x00000000, 0x00000023, 0x00000000,
0x00030047, 0x00000021, 0x00000003, 0x00040047,
0x00000023, 0x00000022, 0x00000000, 0x00040047,
0x00000023, 0x00000021, 0x00000001, 0x00040047,
0x00000029, 0x00000001, 0x00000000, 0x00040047,
0x0000002a, 0x00000001, 0x00000001, 0x00040047,
0x0000002b, 0x00000001, 0x00000002, 0x00040047,
0x0000002d, 0x0000000b, 0x00000019, 0x00020013,
0x00000002, 0x00030021, 0x00000003, 0x00000002,
0x00040015, 0x00000006, 0x00000020, 0x00000000,
0x00040020, 0x00000007, 0x00000007, 0x00000006,
0x00040017, 0x00000009, 0x00000006, 0x00000003,
0x00040020, 0x0000000a, 0x00000001, 0x00000009,
0x0004003b, 0x0000000a, 0x0000000b, 0x00000001,
0x0004002b, 0x00000006, 0x0000000c, 0x00000000,
0x00040020, 0x0000000d, 0x00000001, 0x00000006,
0x00030016, 0x00000010, 0x00000020, 0x0003001d,
0x00000011, 0x00000010, 0x0003001e, 0x00000012,
0x00000011, 0x00040020, 0x00000013, 0x00000002,
0x00000012, 0x0004003b, 0x00000013, 0x00000014,
0x00000002, 0x00040015, 0x00000015, 0x00000020,
0x00000001, 0x0004002b, 0x00000015, 0x00000016,
0x00000000, 0x0003001d, 0x00000018, 0x00000010,
0x0003001e, 0x00000019, 0x00000018, 0x00040020,
0x0000001a, 0x00000002, 0x00000019, 0x0004003b,
0x0000001a, 0x0000001b, 0x00000002, 0x00040020,
0x0000001d, 0x00000002, 0x00000010, 0x0003001d,
0x00000020, 0x00000010, 0x0003001e, 0x00000021,
0x00000020, 0x00040020, 0x00000022, 0x00000002,
0x00000021, 0x0004003b, 0x00000022, 0x00000023,
0x00000002, 0x00040032, 0x00000006, 0x00000029,
0x00000000, 0x00040032, 0x00000006, 0x0000002a,
0x00000000, 0x00040032, 0x00000006, 0x0000002b,
0x00000000, 0x0004002b, 0x00000006, 0x0000002c,
0x00000001, 0x0006002c, 0x00000009, 0x0000002d,
0x0000002c, 0x0000002c, 0x0000002c, 0x00050036,
0x00000002, 0x00000004, 0x00000000, 0x00000003,
0x000200f8, 0x00000005, 0x0004003b, 0x00000007,
0x00000008, 0x00000007, 0x00050041, 0x0000000d,
0x0000000e, 0x0000000b, 0x0000000c, 0x0004003d,
0x00000006, 0x0000000f, 0x0000000e, 0x0003003e,
0x00000008, 0x0000000f, 0x0004003d, 0x00000006,
0x00000017, 0x00000008, 0x0004003d, 0x00000006,
0x0000001c, 0x00000008, 0x00060041, 0x0000001d,
0x0000001e, 0x0000001b, 0x00000016, 0x0000001c,
0x0004003d, 0x00000010, 0x0000001f, 0x0000001e,
0x0004003d, 0x00000006, 0x00000024, 0x00000008,
0x00060041, 0x0000001d, 0x00000025, 0x00000023,
0x00000016, 0x00000024, 0x0004003d, 0x00000010,
0x00000026, 0x00000025, 0x00050085, 0x00000010,
0x00000027, 0x0000001f, 0x00000026, 0x00060041,
0x0000001d, 0x00000028, 0x00000014, 0x00000016,
0x00000017, 0x0003003e, 0x00000028, 0x00000027,
0x000100fd, 0x00010038 };
} // namespace kp

View File

@ -0,0 +1,29 @@
// Copyright 2020 Google LLC

// Test compute kernel: replaces each element of `values` with the Fibonacci
// number at that element's value.
RWStructuredBuffer<uint> values : register(u0);

// Buffer element count; overridable via Vulkan specialization constant 0.
[[vk::constant_id(0)]] const uint BUFFER_ELEMENTS = 32;

// Iterative Fibonacci with fib(0) = 0, fib(1) = 1, fib(2) = 1, ...
uint fibonacci(uint n) {
if(n <= 1){
return n;
}
uint curr = 1;
uint prev = 1;
for(uint i = 2; i < n; ++i) {
uint temp = curr;
curr += prev;
prev = temp;
}
return curr;
}

[numthreads(1, 1, 1)]
void main(uint3 GlobalInvocationID : SV_DispatchThreadID)
{
uint index = GlobalInvocationID.x;
// Guard against dispatches wider than the buffer.
if (index >= BUFFER_ELEMENTS)
return;
values[index] = fibonacci(values[index]);
}

View File

@ -9,6 +9,8 @@
# include "ggml-cuda.h" # include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h" # include "ggml-opencl.h"
#elif defined(GGML_USE_KOMPUTE)
# include "ggml-vulkan.h"
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
@ -1182,11 +1184,14 @@ struct llama_context {
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
ggml_metal_context * ctx_metal = NULL; ggml_metal_context * ctx_metal = NULL;
#elif defined(GGML_USE_KOMPUTE)
ggml_kompute_context * ctx_kompute = NULL;
#endif #endif
#ifdef GGML_USE_MPI #ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL; ggml_mpi_context * ctx_mpi = NULL;
#endif #endif
}; };
// //
@ -2474,6 +2479,9 @@ static struct ggml_cgraph * llm_build_llama(
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
#if defined(GGML_USE_KOMPUTE)
struct ggml_tensor * toDeviceTensor = nullptr;
#endif
if (tokens) { if (tokens) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@ -2483,6 +2491,9 @@ static struct ggml_cgraph * llm_build_llama(
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
} }
ggml_set_name(inp_tokens, "inp_tokens"); ggml_set_name(inp_tokens, "inp_tokens");
#if defined(GGML_USE_KOMPUTE)
toDeviceTensor = inp_tokens;
#endif
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
} else { } else {
@ -2491,6 +2502,9 @@ static struct ggml_cgraph * llm_build_llama(
#endif #endif
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
#if defined(GGML_USE_KOMPUTE)
toDeviceTensor = inpL;
#endif
ggml_allocr_alloc(lctx.alloc, inpL); ggml_allocr_alloc(lctx.alloc, inpL);
if (!ggml_allocr_is_measure(lctx.alloc)) { if (!ggml_allocr_is_measure(lctx.alloc)) {
@ -2693,7 +2707,6 @@ static struct ggml_cgraph * llm_build_llama(
offload_func(cur); offload_func(cur);
ggml_set_name(cur, "ffn_norm"); ggml_set_name(cur, "ffn_norm");
} }
struct ggml_tensor * tmp = ggml_mul_mat(ctx0, struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
model.layers[il].w3, model.layers[il].w3,
cur); cur);
@ -2752,6 +2765,16 @@ static struct ggml_cgraph * llm_build_llama(
ggml_free(ctx0); ggml_free(ctx0);
#if defined(GGML_USE_KOMPUTE)
if (lctx.ctx_kompute && N == 1) {
if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
ggml_vk_h2d_all(lctx.ctx_kompute);
} else {
ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
}
}
#endif
return gf; return gf;
} }
@ -3792,6 +3815,17 @@ static bool llama_eval_internal(
} else { } else {
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
} }
#elif defined(GGML_USE_KOMPUTE)
if (lctx.ctx_kompute && N == 1) {
ggml_vk_graph_compute(lctx.ctx_kompute, gf);
ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
} else {
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
if (lctx.ctx_kompute) {
ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);
ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v);
}
}
#else #else
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
#endif #endif
@ -3833,12 +3867,12 @@ static bool llama_eval_internal(
} }
// extract embeddings // extract embeddings
if (!lctx.embedding.empty()) { //if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding; // auto & embedding_out = lctx.embedding;
embedding_out.resize(n_embd); // embedding_out.resize(n_embd);
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); // memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
} //}
// measure the performance only for the single-token evals // measure the performance only for the single-token evals
if (N == 1) { if (N == 1) {
@ -5904,6 +5938,7 @@ static int llama_apply_lora_from_file_internal(
) { ) {
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
const int64_t t_start_lora_us = ggml_time_us(); const int64_t t_start_lora_us = ggml_time_us();
auto fin = std::ifstream(path_lora, std::ios::binary); auto fin = std::ifstream(path_lora, std::ios::binary);

View File

@ -42,7 +42,7 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1 #define LLAMA_SESSION_VERSION 1
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU. // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD #define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif #endif

18
undump.py Normal file
View File

@ -0,0 +1,18 @@
import struct
import numpy as np
from pathlib import Path
def undump(fn):
    """Load one ggml-style tensor dump file into a numpy array.

    File layout: four little-impact native-order uint64 dimensions, one
    uint64 payload byte size, then the raw payload (assumed float32).

    Returns a float64 ndarray reshaped to the stored dims with singleton
    axes squeezed out.

    Raises ValueError if the file is shorter than the declared payload size.
    """
    with open(fn, 'rb') as df:
        dims = struct.unpack('=QQQQ', df.read(8 * 4))
        (dsz,) = struct.unpack('=Q', df.read(8))
        data = df.read(dsz)
    if len(data) != dsz:
        raise ValueError(
            f'truncated dump {fn}: expected {dsz} bytes, got {len(data)}')
    # assume f32 payload; np.frombuffer replaces the per-element
    # struct.iter_unpack loop. Cast to float64 to match the dtype the
    # original list-of-Python-floats construction produced.
    arr = np.frombuffer(data, dtype=np.float32).astype(np.float64)
    return arr.reshape(dims).squeeze()
if __name__ == '__main__':
    # Pretty-print every *.dump tensor found in the current directory.
    for dump_path in sorted(Path('.').glob('*.dump')):
        arr = undump(dump_path)
        print(f'{dump_path}: {arr.shape}\n{arr}')